diff --git b/Documentation/ABI/testing/debugfs-aufs b/Documentation/ABI/testing/debugfs-aufs
new file mode 100644
index 0000000..99642d1
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-aufs
@@ -0,0 +1,50 @@
+What:		/debug/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		Under /debug/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/debug/aufs/si_<id>/plink
+Date:		Apr 2013
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It has three lines and shows the information about the
+		pseudo-links. The first line is a single number
+		representing the number of buckets. The second line is
+		the number of pseudo-links per bucket (separated by a
+		blank). The last line is a single number representing
+		the total number of pseudo-links.
+		When the aufs mount option 'noplink' is specified, it
+		will show "1\n0\n0\n".
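+		For example, reading it might look like this (a sketch;
+		the numbers are illustrative and depend on the actual
+		mount):
+		  # cat /debug/aufs/si_*/plink
+		  1
+		  0
+		  0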
+
+What:		/debug/aufs/si_<id>/xib
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the blocks consumed by the xib (External Inode
+		Number Bitmap), its block size and file size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xino0, xino1 ... xinoN
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the blocks consumed by each xino (External
+		Inode Number Translation Table), its link count, block
+		size and file size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xigen
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the blocks consumed by the xigen (External
+		Inode Generation Table), its block size and file size.
+		If CONFIG_AUFS_EXPORT is disabled, this entry will not
+		be created.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
diff --git b/Documentation/ABI/testing/sysfs-aufs b/Documentation/ABI/testing/sysfs-aufs
new file mode 100644
index 0000000..82f9518
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-aufs
@@ -0,0 +1,31 @@
+What:		/sys/fs/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		Under /sys/fs/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/sys/fs/aufs/si_<id>/br0, br1 ... brN
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the absolute path of a member directory (called
+		a branch) in aufs, and its permission.
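+		For example (a sketch; the path is illustrative and the
+		output format may differ slightly between versions):
+		  # cat /sys/fs/aufs/si_*/br0
+		  /tmp/rw=rw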
+
+What:		/sys/fs/aufs/si_<id>/brid0, brid1 ... bridN
+Date:		July 2013
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the id of a member directory (called a branch)
+		in aufs.
+
+What:		/sys/fs/aufs/si_<id>/xi_path
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the absolute path of the XINO (External Inode
+		Number Bitmap, Translation Table and Generation Table)
+		file, even if it is the default path.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
diff --git b/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README
new file mode 100644
index 0000000..36df674
--- /dev/null
+++ b/Documentation/filesystems/aufs/README
@@ -0,0 +1,392 @@
+
+Aufs4 -- advanced multi layered unification filesystem version 4.x
+http://aufs.sf.net
+Junjiro R. Okajima
+
+
+0. Introduction
+----------------------------------------
+In the early days, aufs was an entirely re-designed and re-implemented
+version of the Unionfs Version 1.x series. With many original ideas,
+approaches, improvements and implementations added, it became totally
+different from Unionfs while keeping the basic features.
+Recently, the Unionfs Version 2.x series began taking some of the same
+approaches as aufs1.
+Unionfs is being developed by Professor Erez Zadok at Stony Brook
+University and his team.
+
+Aufs4 supports linux-4.0 and later; for the linux-3.x series try aufs3.
+If you want support for older kernel versions, try the aufs2-2.6.git or
+aufs2-standalone.git repositories, or aufs1 from CVS on SourceForge.
+
+Note: it became clear that "Aufs was rejected. Let's give it up."
+      According to Christoph Hellwig, linux rejects all union-type
+      filesystems but UnionMount.
+<http://marc.info/?l=linux-kernel&m=123938533724484&w=2>
+
+PS. Al Viro seems to have a plan to merge aufs as well as overlayfs and
+    UnionMount, and he pointed out an issue around a directory mutex
+    lock, which aufs addressed. But it is still unclear whether aufs (or
+    any other union solution) will be merged.
+<http://marc.info/?l=linux-kernel&m=136312705029295&w=1>
+
+
+1. Features
+----------------------------------------
+- unite several directories into a single virtual filesystem. The member
+  directory is called a branch.
+- you can specify permission flags for each branch: 'readonly',
+  'readwrite' and 'whiteout-able.'
+- with an upper writable branch, internal copyup and whiteout, files/dirs
+  on a readonly branch become logically modifiable.
+- dynamic branch manipulation: add, delete.
+- etc...
+
+Also there are many enhancements in aufs, such as:
+- test only the highest one for the directory permission (dirperm1)
+- copyup on open (coo=)
+- 'move' policy for copy-up between two writable branches, after
+  checking free space.
+- xattr, acl
+- readdir(3) in userspace.
+- keep inode number by external inode number table
+- keep the timestamps of file/dir in internal copyup operation
+- seekable directory, supporting NFS readdir.
+- whiteout is hardlinked in order to reduce the consumption of inodes
+  on the branch
+- neither copy up nor create a whiteout when it is unnecessary
+- revert a single systemcall when an error occurs in aufs
+- remount interface instead of ioctl
+- maintain /etc/mtab by an external command, /sbin/mount.aufs.
+- loopback mounted filesystem as a branch
+- kernel thread for removing a dir which has plenty of whiteouts
+- support copy-up of sparse files (files which have a 'hole' in them)
+- default permission flags for branches
+- selectable permission flags for a ro branch, whether a whiteout can
+  exist or not
+- export via NFS.
+- support <sysfs>/fs/aufs and <debugfs>/aufs.
+- support multiple writable branches, some policies to select one
+  among multiple writable branches.
+- a new semantics for link(2) and rename(2) to support multiple
+  writable branches.
+- no glibc changes are required.
+- pseudo hardlink (hardlink over branches)
+- allow manual direct access to a file on a branch, i.e. bypassing aufs,
+  including NFS or remote filesystem branches.
+- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX.
+- and more...
+
+Currently these features are dropped temporarily from aufs4.
+See design/08plan.txt for details.
+- nested mount, i.e. aufs as readonly no-whiteout branch of another aufs
+  (robr)
+- statistics of aufs thread (/sys/fs/aufs/stat)
+
+Features or just ideas for the future (see also design/*.txt):
+- reorder the branch index without del/re-add.
+- permanent xino files for NFSD
+- an option for refreshing the opened files after add/del branches
+- light version, without branch manipulation. (unnecessary?)
+- copyup in userspace
+- inotify in userspace
+- readv/writev
+
+
+2. Download
+----------------------------------------
+There are three GIT trees for aufs4, aufs4-linux.git,
+aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in
+"aufs-util.git."
+While aufs-util is always necessary, you need either aufs4-linux or
+aufs4-standalone.
+
+The aufs4-linux tree includes the whole linux mainline GIT tree,
+git://git.kernel.org/.../torvalds/linux.git.
+And you cannot select CONFIG_AUFS_FS=m for this version, i.e. you cannot
+build aufs4 as an external kernel module.
+Several extra patches are not included in this tree; only the
+aufs4-standalone tree contains them. They are described in the later
+section "Configuration and Compilation."
+
+On the other hand, the aufs4-standalone tree has only aufs source files
+and necessary patches, and you can select CONFIG_AUFS_FS=m.
+But you need to apply all aufs patches manually.
+
+You will find GIT branches whose names are in the form "aufs4.x", where
+"x" represents the linux kernel version, "linux-4.x". For instance,
+"aufs4.0" is for linux-4.0. For the latest "linux-4.x-rcN", use the
+"aufs4.x-rcN" branch.
+
+o aufs4-linux tree
+$ git clone --reference /your/linux/git/tree \
+	git://github.com/sfjro/aufs4-linux.git aufs4-linux.git
+- if you don't have linux GIT tree, then remove "--reference ..."
+$ cd aufs4-linux.git
+$ git checkout origin/aufs4.0
+
+Or you may want to git-pull aufs directly into your linux GIT tree, and
+leave the patch-work to GIT.
+$ cd /your/linux/git/tree
+$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git
+$ git fetch aufs4
+$ git checkout -b my4.0 v4.0
+$ (add your local change...)
+$ git pull aufs4 aufs4.0
+- now you have v4.0 + your_changes + aufs4.0 in your my4.0 branch.
+- you may need to solve some conflicts between your_changes and
+  aufs4.0. In this case, git-rerere is recommended so that you can
+  solve similar conflicts automatically when you upgrade to 4.1 or
+  later in the future.
+
+o aufs4-standalone tree
+$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git
+$ cd aufs4-standalone.git
+$ git checkout origin/aufs4.0
+
+o aufs-util tree
+$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git
+- note that the public aufs-util.git is on SourceForge instead of
+  GitHub.
+$ cd aufs-util.git
+$ git checkout origin/aufs4.0
+
+Note: The 4.x-rcN branch is to be used with `rc' kernel versions ONLY.
+The minor version number, the 'x' in '4.x', of aufs may not always
+follow the minor version number of the kernel, because changes in the
+kernel that cause the use of a new minor version number do not always
+require changes to aufs-util.
+
+Since aufs-util has its own minor version number, you may not be
+able to find a GIT branch in aufs-util for your kernel's
+exact minor version number.
+In this case, you should git-checkout the branch for the
+nearest lower number.
+
+For (an unreleased) example:
+If you are using "linux-4.10" and the "aufs4.10" branch
+does not exist in the aufs-util repository, then "aufs4.9", "aufs4.8"
+or something numerically smaller is the branch for your kernel.
+
+Also you can view all branches by
+	$ git branch -a
+
+
+3. Configuration and Compilation
+----------------------------------------
+Make sure you have git-checkout'ed the correct branch.
+
+For aufs4-linux tree,
+- enable CONFIG_AUFS_FS.
+- set other aufs configurations if necessary.
+
+For aufs4-standalone tree,
+There are several ways to build.
+
+1.
+- apply ./aufs4-kbuild.patch to your kernel source files.
+- apply ./aufs4-base.patch too.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too, if you plan to set
+  CONFIG_AUFS_FS=m. Otherwise you don't need ./aufs4-standalone.patch.
+- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your
+  kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild.
+- enable CONFIG_AUFS_FS, you can select either
+  =m or =y.
+- and build your kernel as usual.
+- install the built kernel.
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file if you set CONFIG_AUFS_FS=m.
+- install the header files too, by "make headers_install", to the
+  directory you specify. By default, it is $PWD/usr.
+  "make help" shows a brief note about headers_install.
+- and reboot your system.
+
+2.
+- module only (CONFIG_AUFS_FS=m).
+- apply ./aufs4-base.patch to your kernel source files.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too.
+- build your kernel, don't forget "make headers_install", and reboot.
+- edit ./config.mk and set other aufs configurations if necessary.
+  Note: You should read $PWD/fs/aufs/Kconfig carefully, as it describes
+  every aufs configuration.
+- build the module by simple "make".
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file.
+- you can specify the ${KDIR} make variable, which points to your kernel
+  source tree (see the command sketch after this list).
+- install the files
+  + run "make install" to install the aufs module, or copy the built
+    $PWD/aufs.ko to /lib/modules/... and run depmod -a (or reboot simply).
+  + run "make install_headers" (instead of headers_install) to install
+    the modified aufs header file (you can specify DESTDIR which is
+    available in aufs standalone version's Makefile only), or copy
+    $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever
+    you like manually. By default, the target directory is $PWD/usr.
+- there is no need to apply aufs4-kbuild.patch, nor to copy source files
+  into your kernel source tree.
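+
+As a rough command sequence for this module-only build (the paths and
+patch commands are examples; adjust them to your environment):
+$ cd /your/linux/git/tree
+$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-base.patch
+$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-mmap.patch
+$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-standalone.patch
+$ make && make headers_install
+- install the built kernel and reboot into it.
+$ cd /path/to/aufs4-standalone.git
+$ make KDIR=/your/linux/git/tree
+# make install
+# make install_headers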
+
+Note: The header file aufs_type.h is necessary to build aufs-util,
+      as is "make headers_install" in the kernel source tree.
+      headers_install tends to be forgotten, but it is essential,
+      and not only for building aufs-util.
+      You may not hit problems without headers_install on some older
+      versions, though.
+
+And then,
+- read the README in aufs-util, then build and install it (a rough
+  sketch follows this list).
+- note that your distribution may contain an obsolete version of
+  aufs_type.h in /usr/include/linux or somewhere else. When you build
+  the aufs utilities, make sure that your compiler refers to the correct
+  aufs header file, which is built by "make headers_install."
+- if you want to use readdir(3) in userspace or the pathconf(3) wrapper,
+  then run "make install_ulib" too. And refer to the aufs manual for
+  details.
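+
+A rough sketch of the aufs-util step (assuming the kernel headers were
+already installed by "make headers_install" as noted above; see the
+README in aufs-util for the authoritative instructions):
+$ cd aufs-util.git
+$ make
+# make install
+# make install_ulib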
+
+There are several other patches in aufs4-standalone.git. They are all
+optional. When you meet some problems, they may help you.
+- aufs4-loopback.patch
+  Supports a nested loopback mount in a branch-fs. This patch is
+  unnecessary until aufs produces a message like "you may want to try
+  another patch for loopback file".
+- vfs-ino.patch
+  Modifies the system-global kernel-internal function get_next_ino() in
+  order to stop assigning 0 as an inode number. Not directly related to
+  aufs, but recommended generally.
+- tmpfs-idr.patch
+  Keeps the tmpfs inode number as the lowest value. Effective in reducing
+  the size of aufs XINO files for a tmpfs branch. It also prevents the
+  duplication of inode numbers, which is important for backup tools and
+  other utilities. When you find aufs XINO files for a tmpfs branch
+  growing too much, try this patch.
+- lockdep-debug.patch
+  Because aufs is not only an ordinary filesystem (callee of VFS), but
+  also a caller of VFS functions for branch filesystems, subclassing of
+  the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging
+  feature of the linux kernel. If you enable CONFIG_LOCKDEP, then you
+  will need to apply this debug patch to expand several constant values.
+  If you don't know what LOCKDEP is, then you don't have to apply this
+  patch.
+
+
+4. Usage
+----------------------------------------
+At first, make sure aufs-util is installed, and please read the aufs
+manual, aufs.5 in the aufs-util.git tree.
+$ man -l aufs.5
+
+And then,
+$ mkdir /tmp/rw /tmp/aufs
+# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs
+
+Here is another example. The result is equivalent.
+# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs
+  Or
+# mount -t aufs -o br:/tmp/rw none /tmp/aufs
+# mount -o remount,append:${HOME} /tmp/aufs
+
+Then, you can see the whole tree of your home dir through /tmp/aufs. If
+you modify a file under /tmp/aufs, the one in your home directory is
+not affected; instead a file with the same name will be newly created
+under /tmp/rw, and all of your modifications to the file will be applied
+to the one under /tmp/rw. This is called the file-based Copy on Write
+(COW) method.
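+For example (illustrative; the file name is arbitrary):
+$ echo test > /tmp/aufs/newfile
+$ ls /tmp/rw/newfile ${HOME}/newfile
+- the new file exists under /tmp/rw only, not in your home directory.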
+Aufs mount options are described in aufs.5.
+If you run chroot or something and make your aufs the root directory,
+then you need to customize the shutdown script. See the aufs manual for
+details.
+
+Additionally, there are some sample usages of aufs, such as a diskless
+system with network booting, and a LiveCD over NFS.
+See the sample dir in the CVS tree on SourceForge.
+
+
+5. Contact
+----------------------------------------
+When you have any problems or see strange behaviour in aufs, please let
+me know with the following (see the example commands after this list):
+- /proc/mounts (instead of the output of mount(8))
+- /sys/module/aufs/*
+- /sys/fs/aufs/* (if you have them)
+- /debug/aufs/* (if you have them)
+- linux kernel version
+  if your kernel is not plain, for example modified by a distributor,
+  the URL where I can download its source is necessary too.
+- the aufs version which was printed at module load or system boot,
+  instead of the date you downloaded it.
+- configuration (define/undefine CONFIG_AUFS_xxx)
+- kernel configuration or /proc/config.gz (if you have it)
+- the behaviour which you think is incorrect
+- the actual operations; a reproducible case is better
+- mailto: aufs-users at lists.sourceforge.net
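+
+A rough way to collect most of the above in one go (illustrative; some
+of the paths may not exist on your system):
+$ cat /proc/mounts
+$ uname -r
+$ grep -r . /sys/module/aufs/ /sys/fs/aufs/ /debug/aufs/ 2>/dev/null
+$ zcat /proc/config.gz | grep AUFS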
+
+Usually, I don't watch the Public Areas (Bugs, Support Requests, Patches,
+and Feature Requests) on SourceForge. Please join and write to the
+aufs-users ML.
+
+
+6. Acknowledgements
+----------------------------------------
+Thanks to everyone who has tried and is using aufs, and to everyone who
+has reported a bug or given any feedback.
+
+Especially donators:
+Tomas Matejicek(slax.org) made a donation (much more than once).
+	Since Apr 2010, Tomas M (the author of Slax and Linux Live
+	scripts) has been making "doubling" donations.
+	Unfortunately I cannot list all of the donators, but I really
+	appreciate it.
+	It ended in Aug 2010, but the ordinary donation URL is still available.
+	<http://sourceforge.net/donate/index.php?group_id=167503>
+Dai Itasaka made a donation (2007/8).
+Chuck Smith made a donation (2008/4, 10 and 12).
+Henk Schoneveld made a donation (2008/9).
+Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10).
+Francois Dupoux made a donation (2008/11).
+Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public
+	aufs2 GIT tree (2009/2).
+William Grant made a donation (2009/3).
+Patrick Lane made a donation (2009/4).
+The Mail Archive (mail-archive.com) made donations (2009/5).
+Nippy Networks (Ed Wildgoose) made a donation (2009/7).
+New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11).
+Pavel Pronskiy made a donation (2011/2).
+Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy
+	Networks (Ed Wildgoose) made a donation for hardware (2011/3).
+Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and
+11).
+Sam Liddicott made a donation (2011/9).
+Era Scarecrow made a donation (2013/4).
+Bor Ratajc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+POIRETTE Marc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+lauri kasvandik made a donation (2013/5).
+"pemasu from Finland" made a donation (2013/7).
+The Parted Magic Project made a donation (2013/9 and 11).
+Pavel Barta made a donation (2013/10).
+Nikolay Pertsev made a donation (2014/5).
+James B made a donation (2014/7 and 2015/7).
+Stefano Di Biase made a donation (2014/8).
+Daniel Epellei made a donation (2015/1).
+OmegaPhil made a donation (2016/1).
+Tomasz Szewczyk made a donation (2016/4).
+
+Thank you very much.
+Donations, including future ones, are always very important and helpful
+for me to keep on developing aufs.
+
+
+7.
+----------------------------------------
+If you are an experienced user, no explanation is needed. Aufs is
+just a linux filesystem.
+
+
+Enjoy!
+
+# Local variables: ;
+# mode: text;
+# End: ;
diff --git b/Documentation/filesystems/aufs/design/01intro.txt b/Documentation/filesystems/aufs/design/01intro.txt
new file mode 100644
index 0000000..5d01214
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/01intro.txt
@@ -0,0 +1,157 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Introduction
+----------------------------------------
+
+aufs [ei ju: ef es] | [a u f s]
+1. abbrev. for "advanced multi-layered unification filesystem".
+2. abbrev. for "another unionfs".
+3. abbrev. for "auf das" in German which means "on the" in English.
+   Ex. "Butter aufs Brot"(G) means "butter onto bread"(E).
+   But "Filesystem aufs Filesystem" is hard to understand.
+
+AUFS is a filesystem with features:
+- multi-layered stackable unification filesystem; the member directory
+  is called a branch.
+- branch permission and attribute, 'readonly', 'real-readonly',
+  'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their
+  combination.
+- internal "file copy-on-write".
+- logical deletion, whiteout.
+- dynamic branch manipulation, adding, deleting and changing permission.
+- allow bypassing aufs, user's direct branch access.
+- external inode number translation table and bitmap which maintains the
+  persistent aufs inode number.
+- seekable directory, including NFS readdir.
+- file mapping, mmap and sharing pages.
+- pseudo-link, hardlink over branches.
+- loopback mounted filesystem as a branch.
+- several policies to select one among multiple writable branches.
+- revert a single systemcall when an error occurs in aufs.
+- and more...
+
+
+Multi Layered Stackable Unification Filesystem
+----------------------------------------------------------------------
+Most people already know what it is.
+It is a filesystem which unifies several directories and provides a
+single merged directory. When users access a file, the access is
+passed/redirected/converted (sorry, I am not sure which English word is
+correct) to the real file on the member filesystem. The member
+filesystem is called a 'lower filesystem' or 'branch' and has a mode,
+'readonly' or 'readwrite.' The deletion of a file on a lower readonly
+branch is handled by creating a 'whiteout' on the upper writable branch.
+
+On LKML, there have been discussions about UnionMount (Jan Blunck,
+Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took
+different approaches to implement the merged-view.
+The former tries putting it into VFS, and the latter implements as a
+separate filesystem.
+(If I misunderstand these implementations, please let me know and I
+shall correct it, because it has been a long time since I last read
+their source files.)
+
+UnionMount's approach may be able to stay small, but it may be hard to
+share branches between several UnionMounts since its whiteout is
+implemented in the inode on the branch filesystem and is always
+shared. According to Bharata's post, readdir does not seem to be
+finished yet.
+There are several missing features known in these implementations,
+such as:
+- for users, the inode number may change silently, e.g. by copy-up.
+- link(2) may break by copy-up.
+- read(2) may get obsolete file data (fstat(2) too).
+- fcntl(F_SETLK) may be broken by copy-up.
+- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after
+  open(O_RDWR).
+
+In linux-3.18, the "overlay" filesystem (formerly known as "overlayfs")
+was merged into mainline. This is another implementation of UnionMount
+as a separate filesystem. All the limitations and known problems of
+UnionMount are equally inherited by the "overlay" filesystem.
+
+Unionfs has a longer history. When I started implementing a stackable
+filesystem (Aug 2005), it already existed. It has virtual super_block,
+inode, dentry and file objects, and they have an array pointing to the
+lower objects of the same kind. After contributing many patches for
+Unionfs, I re-started my project AUFS (Jun 2006).
+
+In AUFS, the filesystem structure resembles Unionfs's, but I implemented
+my own ideas, approaches and enhancements, and it became a totally
+different one.
+
+Comparing a DM snapshot and an fs-based implementation:
+- the number of bytes to be copied between devices is much smaller.
+- only a single filesystem type can be used.
+- the fs must be writable; no readonly fs, even for the lower original
+  device, so a compression fs will not be usable. But if we use a
+  loopback mount, we may address this issue.
+  For instance,
+	mount -o loop,ro /cdrom/squashfs.img /sq
+	losetup /dev/loop1 /sq/ext2.img
+	losetup /dev/loop2 /somewhere/cow
+	dmsetup create snap --table \
+		"0 $(blockdev --getsz /dev/loop1) snapshot /dev/loop1 /dev/loop2 p 8"
+- it will be difficult (or need more operations) to extract the
+  difference between the original device and the COW.
+- DM snapshot-merge may help a lot when users try merging. In the
+  fs-layer union, users will use rsync(1).
+
+You may want to read my old paper "Filesystems in LiveCD"
+(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf).
+
+
+Several characters/aspects/persona of aufs
+----------------------------------------------------------------------
+
+Aufs has several characters, aspects or personas.
+1. a filesystem, callee of VFS helper
+2. sub-VFS, caller of VFS helper for branches
+3. a virtual filesystem which maintains persistent inode number
+4. reader/writer of files on branches such like an application
+
+1. Callee of VFS Helper
+As an ordinary linux filesystem, aufs is a callee of VFS. For instance,
+unlink(2) from an application reaches the sys_unlink() kernel function
+and then vfs_unlink() is called. vfs_unlink() is one of the VFS helpers
+and it calls the filesystem-specific unlink operation. Aufs actually
+implements the unlink operation, but it behaves like a redirector.
+
+2. Caller of VFS Helper for Branches
+aufs_unlink() passes the unlink request to the branch filesystem as if
+it were called from VFS. So the called unlink operation of the branch
+filesystem acts as usual. As a caller of VFS helper, aufs should handle
+every necessary pre/post operation for the branch filesystem.
+- acquire the lock for the parent dir on a branch
+- lookup in a branch
+- revalidate dentry on a branch
+- mnt_want_write() for a branch
+- vfs_unlink() for a branch
+- mnt_drop_write() for a branch
+- release the lock on a branch
+
+3. Persistent Inode Number
+One of the most important issues for a filesystem is to maintain inode
+numbers. This is particularly important to support exporting a
+filesystem via NFS. Aufs is a virtual filesystem which doesn't have a
+backend block device of its own, but some storage is necessary to
+keep and maintain the inode numbers. It may be a large space and may not
+be suitable to keep in memory. Aufs rents some space from its first
+writable branch filesystem (by default) and creates file(s) on it. These
+files are created by aufs internally and removed soon (currently), while
+being kept open.
+Note: Because these files are removed, they are totally gone after
+      unmounting aufs. It means the inode numbers are not persistent
+      across unmount or reboot. I have a plan to make them really
+      persistent which will be important for aufs on NFS server.
+
+4. Read/Write Files Internally (copy-on-write)
+Because a branch can be readonly, when you write to a file on it, aufs
+will "copy up" the file to the upper writable branch internally, and
+then write the originally requested change to that file. Generally the
+kernel doesn't open/read/write files actively. In aufs, even a single
+write may cause an internal "file copy". This behaviour is very similar
+to the cp(1) command.
+
+Some people may think it is better to pass such work to a userspace
+helper instead of doing it in kernel space. Actually I am still thinking
+about it, but currently I have implemented it in kernel space.
diff --git b/Documentation/filesystems/aufs/design/02struct.txt b/Documentation/filesystems/aufs/design/02struct.txt
new file mode 100644
index 0000000..783328a
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/02struct.txt
@@ -0,0 +1,245 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Basic Aufs Internal Structure
+
+Superblock/Inode/Dentry/File Objects
+----------------------------------------------------------------------
+Like an ordinary filesystem, aufs has its own
+superblock/inode/dentry/file objects. All these objects have a
+dynamically allocated array which stores pointers to the objects of the
+same kind on the lower filesystems, the branches.
+For example, assume you build a union with one readwrite branch and one
+readonly branch, mounted on /au, /rw and /ro respectively.
+- /au = /rw + /ro
+- /ro/fileA exists but /rw/fileA does not
+
+The aufs lookup operation finds /ro/fileA and gets a dentry for it.
+These pointers are stored in the aufs dentry. The array in the aufs
+dentry will be,
+- [0] = NULL (because /rw/fileA doesn't exist)
+- [1] = /ro/fileA
+
+This style of array is essentially the same for the aufs
+superblock/inode/dentry/file objects.
+
+Because aufs supports manipulating branches, i.e. adding/deleting/changing
+branches dynamically, these objects have their own generation. When
+branches are changed, the generation in the aufs superblock is
+incremented, and the generation in the other objects is compared with it
+when they are accessed. When the generation in an object is obsolete,
+aufs refreshes its internal array.
+
+
+Superblock
+----------------------------------------------------------------------
+Additionally the aufs superblock has some data for the policies to select
+one among multiple writable branches, the XIB file, pseudo-links and a
+kobject. See below for details.
+About the policies which support copying down a directory, see
+wbr_policy.txt too.
+
+
+Branch and XINO(External Inode Number Translation Table)
+----------------------------------------------------------------------
+Every branch has its own xino (external inode number translation table)
+file. The xino file is created and unlinked by aufs internally. When two
+members of a union exist on the same filesystem, they share a single
+xino file.
+The structure of a xino file is simple: just a sequence of aufs inode
+numbers, indexed by the lower inode number.
+In the above sample, assume the inode number of /ro/fileA is i111 and
+aufs assigns the inode number i999 for fileA. Then aufs writes 999 as
+4(8) bytes at 111 * 4(8) bytes offset in the xino file.
+
+When the inode numbers are not contiguous, the xino file will be sparse,
+containing holes, and doesn't consume as much disk space as it might
+appear to. If your branch filesystem consumes disk space for such
+holes, then you should specify the 'xino=' option when mounting aufs.
+
+Aufs has a mount option to free the disk blocks for such holes in XINO
+files on tmpfs or ramdisk. But it is not so effective actually. If you
+meet a problem of disk shortage due to XINO files, then you should try
+"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git.
+The patch localizes the assignment of inode numbers per tmpfs mount and
+avoids the holes in XINO files.
+
+Also a writable branch has three kinds of "whiteout bases". All of these
+exist when the branch is joined to aufs, and their names are doubly
+whiteout-ed, so that users will never see their names in the aufs
+hierarchy.
+1. a regular file which will be hardlinked to all whiteouts.
+2. a directory to store a pseudo-link.
+3. a directory to store an "orphan"-ed file temporarily.
+
+1. Whiteout Base
+   When you remove a file on a readonly branch, aufs handles it as a
+   logical deletion and creates a whiteout on the upper writable branch
+   as a hardlink of this file in order not to consume inode on the
+   writable branch.
+2. Pseudo-link Dir
+   See below, Pseudo-link.
+3. Step-Parent Dir
+   When "fileC" exists only on the lower readonly branch, and it is
+   opened and removed along with its parent dir, and then the user
+   writes something into it, aufs copies-up fileC to this directory,
+   because there is no other dir to store fileC. After creating the
+   file under this dir, the file is unlinked.
+
+Because aufs supports manipulating branches, i.e. adding/deleting/changing
+them dynamically, a branch has its own id. When the branch order changes,
+aufs finds the new index by searching for the branch id.
+
+
+Pseudo-link
+----------------------------------------------------------------------
+Assume "fileA" exists on the lower readonly branch only and it is
+hardlinked to "fileB" on the branch. When you write something to fileA,
+aufs copies it up to the upper writable branch. Additionally aufs
+creates a hardlink under the Pseudo-link Directory of the writable
+branch. The inode of a pseudo-link is kept in aufs super_block as a
+simple list. If fileB is read after unlinking fileA, aufs returns
+filedata from the pseudo-link instead of the lower readonly
+branch. Because the pseudo-link is based upon the inode, keeping the
+inode number with xino (see above) is essential.
+
+All the hardlinks under the Pseudo-link Directory of the writable branch
+should be restored to a proper location later. Aufs provides a utility
+to do this. The userspace helpers are executed at remounting and
+unmounting aufs by default.
+While this utility is running, it puts aufs into the pseudo-link
+maintenance mode. In this mode, only the process which began the
+maintenance mode (and its child processes) is allowed to operate in
+aufs. Some other processes which are not related to the pseudo-link will
+be allowed to run too, but the rest have to return an error or wait
+until the maintenance mode ends. If a process has already acquired an
+inode mutex (in VFS), it has to return an error.
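+
+For reference, the helper can also be invoked by hand, roughly like this
+(the mount point is an example; see the aufs-util manuals for the exact
+usage):
+	# auplink /tmp/aufs flush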
+
+
+XIB(external inode number bitmap)
+----------------------------------------------------------------------
+In addition to the xino file per branch, aufs has an external inode
+number bitmap in its superblock object. It is also an internal file,
+like a xino file.
+It is a simple bitmap which marks whether an aufs inode number is in use
+or not.
+To reduce file I/O, aufs prepares a single memory page to cache the xib.
+
+As well as XINO files, aufs has a feature to truncate/refresh XIB to
+reduce the number of consumed disk blocks for these files.
+
+
+Virtual or Vertical Dir, and Readdir in Userspace
+----------------------------------------------------------------------
+In order to support multiple layers (branches), the aufs readdir
+operation constructs a virtual dir block in memory. For readdir, aufs
+calls vfs_readdir() internally for each dir on the branches, merges
+their entries while eliminating the whiteout-ed ones, and stores the
+result in the file (dir) object. So the file object keeps its entry list
+until it is closed. The entry list will be updated when the file
+position is zero and the list has become obsolete. This decision is made
+automatically in aufs.
+
+The dynamically allocated memory block for the names of entries has a
+unit of 512 bytes (by default) and stores the names contiguously (no
+padding). Another block for each entry is handled by a kmem_cache too.
+While building dir blocks, aufs creates a hash list and judges whether
+an entry is whiteout-ed by an upper branch or already listed.
+The merged result is cached in the corresponding inode object and
+maintained by a customizable life-time option.
+
+Some people may say this can be a security hole or invite DoS attacks,
+since an opened and once readdir-ed dir (file object) holds its entry
+list and puts pressure on system memory. But I'd say it is similar
+to the files under /proc or /sys. The virtual files there also hold a
+memory page (generally) while they are opened. When an idea to reduce
+memory for them is introduced, it will be applied to aufs too.
+For those who really hate this situation, I've developed a readdir(3)
+library which performs this merging in userspace. You just need to set
+the LD_PRELOAD environment variable, and aufs will not consume any
+memory in kernel space for readdir(3).
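+For example, something along these lines (the library path is
+hypothetical; it depends on where aufs-util's "make install_ulib"
+installed it):
+	$ LD_PRELOAD=/usr/lib/libau.so ls /your/aufs/dir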
+
+
+Workqueue
+----------------------------------------------------------------------
+Aufs sometimes requires privileged access to a branch, for instance
+in copy-up/down operations. A user process may be going to make changes
+to a file which exists only on the lower readonly branch, while the mode
+of one of the ancestor directories is not writable by that user
+process. Here aufs copies up the file with its ancestors, and they may
+require privilege to set their owner/group/mode/etc.
+This is a typical case of the application character of aufs (see
+Introduction).
+
+Aufs uses a workqueue synchronously for this case. It creates its own
+workqueue. The workqueue is a kernel thread and has privilege. Aufs
+passes it the request to call mkdir or write (for example), and waits
+for its completion. This approach simply solves the problem of signal
+handlers.
+If aufs did not adopt the workqueue and instead changed the privilege of
+the process, then the process might receive an unexpected SIGXFSZ or
+other signals.
+
+Aufs also uses the system global workqueue (the "events" kernel thread)
+for asynchronous tasks, such as handling inotify/fsnotify, re-creating a
+whiteout base, etc. This is unrelated to privilege.
+Most aufs operations try to acquire a rw_semaphore for the aufs
+superblock at the beginning, and at the same time wait for the completion
+of all queued asynchronous tasks.
+
+
+Whiteout
+----------------------------------------------------------------------
+The whiteout in aufs is very similar to Unionfs's; it is represented
+by its filename. UnionMount takes a file-mode approach instead, but I am
+afraid several utilities (find(1) or something) would have to support it.
+
+Basically the whiteout represents "logical deletion", which stops aufs
+from looking up further, but it also represents "dir is opaque", which
+also stops further lookup.
+
+In aufs, rmdir(2) and rename(2) for a dir use whiteout differently.
+In order to make the several steps within a single systemcall
+revertible, aufs adopts an approach of renaming a directory to a
+temporary unique whiteout-ed name.
+For example, in rename(2) of a dir where the target dir already exists,
+aufs renames the target dir to a temporary unique whiteout-ed name before
+the actual rename on a branch, and then handles the other actions (making
+it opaque, updating the attributes, etc). If an error happens during these
+actions, aufs simply renames the whiteout-ed name back and returns an
+error. If all succeed, aufs registers a function with the system global
+workqueue to remove the whiteout-ed unique temporary name completely and
+asynchronously.
+
+
+Copy-up
+----------------------------------------------------------------------
+It is a well-known feature or concept.
+When a user modifies a file on a readonly branch, aufs performs a
+"copy-up" internally and makes the change to the new file on the upper
+writable branch.
+When the triggering systemcall does not update the timestamps of the
+parent dir, aufs reverts them after the copy-up.
+
+
+Move-down (aufs3.9 and later)
+----------------------------------------------------------------------
+"Copy-up" is one of the essential feature in aufs. It copies a file from
+the lower readonly branch to the upper writable branch when a user
+changes something about the file.
+"Move-down" is an opposite action of copy-up. Basically this action is
+ran manually instead of automatically and internally.
+For desgin and implementation, aufs has to consider these issues.
+- whiteout for the file may exist on the lower branch.
+- ancestor directories may not exist on the lower branch.
+- diropq for the ancestor directories may exist on the upper branch.
+- free space on the lower branch will reduce.
+- another access to the file may happen during moving-down, including
+  UDBA (see "Revalidate Dentry and UDBA").
+- the file should not be hard-linked nor pseudo-linked. they should be
+  handled by auplink utility later.
+
+Sometimes users want to move-down a file from the upper writable branch
+to the lower readonly or writable branch. For instance,
+- the free space of the upper writable branch is going to run out.
+- create a new intermediate branch between the upper and lower branch.
+- etc.
+
+For this purpose, use the "aumvdown" command in aufs-util.git.
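+A typical invocation might look like this (the path is an example; see
+the aumvdown manual for the exact options):
+	# aumvdown /your/aufs/dirA/fileA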
diff --git b/Documentation/filesystems/aufs/design/03atomic_open.txt b/Documentation/filesystems/aufs/design/03atomic_open.txt
new file mode 100644
index 0000000..741ad6d
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/03atomic_open.txt
@@ -0,0 +1,72 @@
+
+# Copyright (C) 2015-2016 Junjiro R. Okajima
+
+Support for a branch which has its own ->atomic_open()
+----------------------------------------------------------------------
+The filesystems which implement ->atomic_open() are not the majority.
+For example NFSv4 does, and aufs should call NFSv4's ->atomic_open(),
+particularly for the open(O_CREAT|O_EXCL, 0400) case; without
+->atomic_open(), NFSv4 returns an error for this open(2). I am not sure
+whether all filesystems which have ->atomic_open() behave like this, but
+NFSv4 surely returns the error.
+
+In order to support ->atomic_open() for aufs, there are a few
+approaches.
+
+A. Introduce aufs_atomic_open()
+   - calls one of VFS:do_last(), lookup_open() or atomic_open() for
+     branch fs.
+B. Introduce aufs_atomic_open() calling create, open and chmod. This is
+   the approach of an aufs user, Pip Cet.
+   - calls aufs_create(), VFS finish_open() and notify_change().
+   - pass fake-mode to finish_open(), and then correct the mode by
+     notify_change().
+C. Extend aufs_open() to call branch fs's ->atomic_open()
+   - no aufs_atomic_open().
+   - aufs_lookup() registers the TID to an aufs internal object.
+   - aufs_create() does nothing when the matching TID is registered, but
+     registers the mode.
+   - aufs_open() calls branch fs's ->atomic_open() when the matching
+     TID is registered.
+D. Extend aufs_open() to re-try branch fs's ->open() with superuser's
+   credential
+   - no aufs_atomic_open().
+   - aufs_create() registers the TID to an internal object. this info
+     represents "this process created this file just now."
+   - when aufs gets EACCES from branch fs's ->open(), then confirm the
+     registered TID and re-try open() with superuser's credential.
+
+Pros and cons for each approach.
+
+A.
+   - straightforward but highly depends upon VFS internals.
+   - the atomic behaviour is kept.
+   - some parameters such as nameidata are hard to reproduce for the
+     branch fs.
+   - large overhead.
+B.
+   - easy to implement.
+   - the atomic behaviour is lost.
+C.
+   - the atomic behaviour is kept.
+   - dirty and tricky.
+   - VFS checks whether the file is created correctly after calling
+     ->create(), which means this approach doesn't work.
+D.
+   - easy to implement.
+   - the atomic behaviour is lost.
+   - to open a file with superuser's credential and give it to a user
+     process is a bad idea, since the file object keeps the credential
+     in it. It may affect LSM or something. This approach doesn't work
+     either.
+
+Approach A is ideal, but it is hard to implement. So here is a
+variation of A, which is to be implemented.
+
+A-1. Introduce aufs_atomic_open()
+     - calls the branch fs's ->atomic_open() if it exists; otherwise
+       calls vfs_create() and finish_open().
+     - the demerit is that the several checks after the branch fs's
+       ->atomic_open() are lost. In the ordinary case, the checks are
+       done by VFS:do_last(), lookup_open() and atomic_open(). Some can
+       be implemented in aufs, but not all, I am afraid.
diff --git b/Documentation/filesystems/aufs/design/03lookup.txt b/Documentation/filesystems/aufs/design/03lookup.txt
new file mode 100644
index 0000000..5b6b000
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/03lookup.txt
@@ -0,0 +1,100 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Lookup in a Branch
+----------------------------------------------------------------------
+Since aufs has the character of a sub-VFS (see Introduction), it performs
+lookup on branches as VFS does. It may be heavy work, but almost every
+lookup operation in aufs is the simplest case, i.e. looking up only an
+entry directly connected to its parent. Digging down the directory
+hierarchy is unnecessary. VFS has a function lookup_one_len() for that
+use, and aufs calls it.
+
+When a branch is a remote filesystem, aufs basically relies upon its
+->d_revalidate(); also aufs forces the hardest revalidate tests for
+them.
+For d_revalidate, aufs implements three levels of revalidate tests. See
+"Revalidate Dentry and UDBA" for details.
+
+
+Test Only the Highest One for the Directory Permission (dirperm1 option)
+----------------------------------------------------------------------
+Let's try a case study.
+- aufs has two branches, the upper readwrite and the lower readonly.
+  /au = /rw + /ro
+- "dirA" exists under /ro, but not under /rw, and its mode is 0700.
+- the user invoked "chmod a+rx /au/dirA"
+- the internal copy-up is activated and "/rw/dirA" is created and its
+  permission bits are set to world readable.
+- then, does "/au/dirA" become world readable?
+
+In this case, /ro/dirA is still 0700 since it exists on the readonly
+branch, or it may even be a natively readonly filesystem. If aufs
+respects the lower branch, it should not respond to readdir requests
+from other users. But the user allowed it by chmod. Should aufs really
+reject showing the entries under /ro/dirA?
+
+To be honest, I don't have a good solution for this case. So aufs
+implements the 'dirperm1' and 'nodirperm1' mount options, and leaves it
+to users.
+When dirperm1 is specified, aufs checks only the highest one for the
+directory permission, and shows the entries. Otherwise, as usual, it
+checks every dir existing on all branches and rejects the request.
+
+As a side effect, the dirperm1 option improves the performance of aufs
+because the number of permission checks is reduced when there are many
+branches.
+
+
+Revalidate Dentry and UDBA (User's Direct Branch Access)
+----------------------------------------------------------------------
+Generally VFS helpers re-validate a dentry as a part of lookup.
+0. digging down the directory hierarchy.
+1. lock the parent dir by its i_mutex.
+2. lookup the final (child) entry.
+3. revalidate it.
+4. call the actual operation (create, unlink, etc.)
+5. unlock the parent dir
+
+If the filesystem implements its ->d_revalidate() (step 3), then it is
+called. Actually aufs implements it and checks whether the dentry on a
+branch is still valid.
+But it is not enough, because aufs has to release the lock for the
+parent dir on a branch at the end of ->lookup() (step 2) and
+->d_revalidate() (step 3) while the i_mutex of the aufs dir is still
+held by VFS.
+If the file on a branch is changed directly, i.e. bypassing aufs, after
+aufs released the lock, then the subsequent operation may cause some
+unpleasant result.
+
+This situation is a result of the VFS architecture, where ->lookup() and
+->d_revalidate() are separated. But I never say it is wrong; it is a good
+design from VFS's point of view. It is just not suitable for the sub-VFS
+character of aufs.
+
+Aufs supports such cases with three levels of revalidation, which are
+selectable by the user.
+1. Simple Revalidate
+   In addition to the native flow in VFS, confirm the child-parent
+   relationship on the branch just after locking the parent dir on the
+   branch in the "actual operation" (step 4). When this validation
+   fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still
+   checks the validity of the dentry on branches.
+2. Monitor Changes Internally by Inotify/Fsnotify
+   In addition to the above, in the "actual operation" (step 4) aufs
+   re-looks-up the dentry on the branch, and returns EBUSY if it finds a
+   different dentry.
+   Additionally, aufs sets an inotify/fsnotify watch for every dir on the
+   branches while it is in the cache. When an event is notified, aufs
+   registers a function to the kernel 'events' thread by schedule_work().
+   And the function sets some special status in the cached aufs dentry
+   and inode private data. If they are not cached, then aufs has nothing
+   to do. When the same file is accessed through aufs (steps 0-3) later,
+   aufs will detect the status and refresh all necessary data.
+   In this mode, aufs has to ignore the event which is fired by aufs
+   itself.
+3. No Extra Validation
+   This is the simplest test and doesn't add any additional revalidation
+   test, and skips the revalidation in step 4. It is useful and improves
+   aufs performance when the system surely hides the aufs branches from
+   users, by over-mounting something (or another method).
diff --git b/Documentation/filesystems/aufs/design/04branch.txt b/Documentation/filesystems/aufs/design/04branch.txt
new file mode 100644
index 0000000..e68f4d3
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/04branch.txt
@@ -0,0 +1,61 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Branch Manipulation
+
+Since aufs supports dynamic branch manipulation, i.e. adding/removing a
+branch and changing its permission/attributes, there is a lot of work to
+do.
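+
+From the user's point of view, these manipulations are requested through
+the remount interface, for example (the paths and index are illustrative;
+see aufs.5 for the exact option syntax):
+	# mount -o remount,add:1:/new/branch=ro /your/aufs
+	# mount -o remount,del:/old/branch /your/aufs
+	# mount -o remount,mod:/some/branch=rw /your/aufs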
+
+
+Add a Branch
+----------------------------------------------------------------------
+o Confirm that the directory being added exists outside of aufs
+  (including loopback mounts) and check its various attributes.
+o Initialize the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o Check the owner/group/mode of the directory
+  When the owner/group/mode of the directory being added differs from
+  those of the existing branches, aufs issues a warning because it may
+  pose a security risk.
+  For example, when an upper writable branch has a world-writable empty
+  top directory, a malicious user can create any file on the writable
+  branch directly, as if copying it up and modifying it manually. If
+  something like /etc/{passwd,shadow} exists on the lower readonly
+  branch but not on the upper writable branch, and the writable branch
+  is world-writable, then a malicious user may create /etc/passwd on the
+  writable branch directly and the infected file will be valid in aufs.
+  I am afraid this can be a security issue, but aufs can do nothing
+  except produce a warning.
+
+
+Delete a Branch
+----------------------------------------------------------------------
+o Confirm the branch being deleted is not busy
+  In general, one merit of adopting the "remount" interface to
+  manipulate branches is that it discards caches. When deleting a
+  branch, aufs checks the still-cached (and connected) dentries and
+  inodes. If there are any, then they are all in use. An inode without
+  its corresponding dentry can be alive alone (for example, in the
+  inotify/fsnotify case).
+
+  For the cached one, aufs checks whether the same named entry exists on
+  other branches.
+  If the cached one is a directory, because aufs provides a merged view
+  to users, as long as one dir is left on any branch aufs can show the
+  dir to users. In this case, the branch can be removed from aufs.
+  Otherwise aufs rejects deleting the branch.
+
+  If any file on the branch being deleted is opened by aufs, then aufs
+  rejects deleting.
+
+
+Modify the Permission of a Branch
+----------------------------------------------------------------------
+o Re-initialize or remove the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o rw --> ro: Confirm the modifying branch is not busy
+  Aufs rejects the request if any of these conditions are true.
+  - a file on the branch is mmap-ed.
+  - a regular file on the branch is opened for write and there is no
+    same named entry on the upper branch.
diff --git b/Documentation/filesystems/aufs/design/05wbr_policy.txt b/Documentation/filesystems/aufs/design/05wbr_policy.txt
new file mode 100644
index 0000000..1726d5d
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/05wbr_policy.txt
@@ -0,0 +1,51 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Policies to Select One among Multiple Writable Branches
+----------------------------------------------------------------------
+When there is more than one writable branch, aufs has to decide the
+target branch for file creation or copy-up. By default, the highest
+writable branch which has the parent (or an ancestor) dir of the target
+file is chosen (the top-down-parent policy).
+At the user's request, aufs implements some other policies to select the
+writable branch: for file creation, round-robin, most-free-space and
+other policies; for copy-up, top-down-parent, bottom-up-parent,
+bottom-up and others.
+
+As expected, the round-robin policy selects the branches in a circular
+manner. When you have two writable branches and create 10 new files, 5
+files will be created on each branch. The mkdir(2) systemcall is an
+exception: when you create 10 new directories, all will be created on
+the same branch.
+The most-free-space policy selects the one which has the most free
+space among the writable branches. The amount of free space is checked
+by aufs internally, and users can specify the checking time interval.
+
+The policies for copy-up are simpler:
+top-down-parent is equivalent to the policy of the same name for
+creation;
+bottom-up-parent selects the writable branch where the parent dir
+exists and which is the nearest upper one from the copyup-source;
+bottom-up selects the nearest upper writable branch from the
+copyup-source, regardless of the existence of the parent dir.
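+
+For reference, a policy is selected with mount options roughly like this
+(the branch paths are illustrative; see aufs.5 for the exact option
+names and values):
+	# mount -t aufs -o br=/rw0=rw:/rw1=rw:/ro=ro,create=mfs,cpup=bup none /mnt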
+
+There are some rules or exceptions to apply these policies.
+- If there is a readonly branch above the policy-selected branch and
+  the parent dir is marked as opaque (a variation of whiteout), or the
+  target (creating) file is whiteout-ed on the upper readonly branch,
+  then the result of the policy is ignored and the target file will be
+  created on the nearest upper writable branch than the readonly branch.
+- If there is a writable branch above the policy-selected branch and
+  the parent dir is marked as opaque or the target file is whiteouted
+  on the branch, then the result of the policy is ignored and the target
+  file will be created on the highest one among the upper writable
+  branches which has the diropq or whiteout. In case of a whiteout, aufs
+  removes it as usual.
+- link(2) and rename(2) systemcalls are exceptions in every policy.
+  They try to select the branch where the source exists, if possible,
+  since copying up a large file takes a long time. If that is not
+  possible, i.e. the branch where the source exists is readonly, then
+  they will follow the copyup policy.
+- There is an exception for rename(2) when the target exists.
+  If the rename target exists, aufs compares the index of the branches
+  where the source and the target exists and selects the higher
+  one. If the selected branch is readonly, then aufs follows the
+  copyup policy.
diff --git b/Documentation/filesystems/aufs/design/06fhsm.txt b/Documentation/filesystems/aufs/design/06fhsm.txt
new file mode 100644
index 0000000..84b46dc
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06fhsm.txt
@@ -0,0 +1,105 @@
+
+# Copyright (C) 2011-2016 Junjiro R. Okajima
+
+File-based Hierarchical Storage Management (FHSM)
+----------------------------------------------------------------------
+Hierarchical Storage Management (or HSM) is a well-known feature in the
+storage world. Aufs provides this feature as a file-based one, using
+multiple writable branches, based upon the principle of "the Colder,
+the Lower". Here "colder" refers to less used files, and "lower" refers
+to the position in the vertical order of the stacked branches.
+These multiple writable branches are prioritized, i.e. the topmost one
+should be the fastest drive and be used heavily.
+
+o Characters in aufs FHSM story
+- aufs itself and a new branch attribute.
+- a new ioctl interface to move-down and to establish a connection with
+  the daemon ("move-down" is a converse of "copy-up").
+- userspace tool and daemon.
+
+The userspace daemon establishes a connection with aufs and waits for
+the notification. The notified information is very similar to struct
+statfs containing the number of consumed blocks and inodes.
+When the consumed blocks/inodes of a branch exceeds the user-specified
+upper watermark, the daemon activates its move-down process until the
+consumed blocks/inodes reaches the user-specified lower watermark.
+
+The actual move-down is done by aufs based upon the request from
+user-space since we need to maintain the inode number and the internal
+pointer arrays in aufs.
+
+Currently aufs FHSM handles the regular files only. Additionally they
+must not be hard-linked nor pseudo-linked.
+
+
+o Cowork of aufs and the user-space daemon
+  While the userspace daemon keeps the connection established, aufs
+  sends a small notification to it whenever aufs writes something into a
+  writable branch. But this may be costly since aufs issues statfs(2)
+  internally, so the user can specify a new option to cache the
+  info. Actually the notification is controlled by these factors.
+  + the specified cache time.
+  + whether it is classified as "force" by aufs internally.
+  Until the specified time expires, aufs doesn't send the info
+  except in the forced cases. When aufs decides to force, the info is
+  always notified to userspace.
+  For example, the number of free inodes is generally large enough and
+  a shortage of them happens rarely, so aufs doesn't force the
+  notification when creating a new file, directory and so on. This is
+  the typical case in which aufs doesn't force.
+  When aufs writes the actual file data and the file consumes any new
+  blocks, aufs forces the notification.
+
+
+o Interfaces in aufs
+- New branch attribute.
+  + fhsm
+    Specifies that the branch is managed by the FHSM feature; in other
+    words, it participates in FHSM.
+    When nofhsm is set on the branch, it will not be the source/target
+    branch of the move-down operation. This attribute is set
+    independently of the coo and moo attributes, and if you want full
+    FHSM, you should specify them as well.
+- New mount option.
+  + fhsm_sec
+    Specifies the number of seconds during which many of the less
+    important notifications are suppressed.
+- New ioctl.
+  + AUFS_CTL_FHSM_FD
+    Creates a new file descriptor from which userspace can read the
+    notification (a subset of struct statfs) from aufs.
+- Module parameter 'brs'
+  It has to be set to 1. Otherwise the new mount option 'fhsm' will not
+  be set.
+- mount helpers /sbin/mount.aufs and /sbin/umount.aufs
+  When there are two or more branches with fhsm attributes,
+  /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs
+  terminates it. As a result of remounting and branch-manipulation, the
+  number of branches with fhsm attribute can be one. In this case,
+  /sbin/mount.aufs will terminate the user-space daemon.
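+
+As a usage sketch (hypothetical paths; please verify the branch
+attribute and option syntax against the aufs manual), an FHSM-enabled
+mount with two participating writable branches and a 30 second
+notification cache might look like:
+# mount -t aufs -o br:/ssd=rw+fhsm:/hdd=rw+fhsm,fhsm_sec=30 \
+	none /mnt/union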
+
+
+Finally, the move-down operation is done in kernel-space as these steps.
+- make sure that,
+  + no one else is using the file.
+  + the file is not hard-linked.
+  + the file is not pseudo-linked.
+  + the file is a regular file.
+  + the parent dir is not opaqued.
+- find the target writable branch.
+- make sure the file is not whiteout-ed by the upper (than the target)
+  branch.
+- make the parent dir on the target branch.
+- mutex lock the inode on the branch.
+- unlink the whiteout on the target branch (if exists).
+- lookup and create the whiteout-ed temporary name on the target branch.
+- copy the file as the whiteout-ed temporary name on the target branch.
+- rename the whiteout-ed temporary name to the original name.
+- unlink the file on the source branch.
+- maintain the internal pointer array and the external inode number
+  table (XINO).
+- maintain the timestamps and other attributes of the parent dir and the
+  file.
+
+And of course, in every step, an error may happen. So the operation
+should restore the original file state after an error happens.
diff --git b/Documentation/filesystems/aufs/design/06mmap.txt b/Documentation/filesystems/aufs/design/06mmap.txt
new file mode 100644
index 0000000..991c0b1
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06mmap.txt
@@ -0,0 +1,59 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+mmap(2) -- File Memory Mapping
+----------------------------------------------------------------------
+In aufs, the file-mapped pages are handled by the branch fs directly,
+with no interaction with aufs. This means that aufs_mmap() simply calls
+the branch fs's ->mmap().
+This approach is simple and good, but there is one problem.
+Under /proc, several entries show the mmapped files by their paths
+(with device and inode number), and the printed path will be the path
+on the branch fs instead of the virtual one on aufs.
+This is not a problem in most cases, but some utilities such as
+lsof(1) (and their users) may expect the path on aufs.
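+
+For example (hypothetical process name and paths), checking the
+mapping of a program whose binary lives on an aufs mount:
+# grep prog /proc/$(pidof prog)/maps
+Without the vm_prfile scheme described below, the path column would
+show the file's location on the branch (e.g. /branch0/bin/prog) rather
+than the path on aufs (e.g. /aufs/bin/prog).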
+
+To address this issue, aufs adds a new member called vm_prfile in struct
+vm_area_struct (and struct vm_region). The original vm_file points to
+the file on the branch fs in order to handle everything correctly as
+usual. The new vm_prfile points to a virtual file in aufs, and the
+show-functions in procfs refer to vm_prfile if it is set.
+Also we need to maintain several other places which touch vm_file,
+such as
+- fork()/clone() copies vma and the reference count of vm_file is
+  incremented.
+- merging vma maintains the ref count too.
+
+This is not a good approach. It just fakes the printed path. But it
+leaves all behaviour around f_mapping unchanged. This is surely an
+advantage.
+Actually aufs had previously adopted another, more complicated approach
+which calls generic_file_mmap() and handles struct
+vm_operations_struct itself. With that approach, aufs met a hard
+problem which I could not solve without switching approaches.
+
+There may be yet another approach, which is
+- bind-mount the branch-root onto the aufs-root internally
+- grab the new vfsmount (ie. struct mount)
+- lazy-umount the branch-root internally
+- in open(2) the aufs-file, open the branch-file with the hidden
+  vfsmount (instead of the original branch's vfsmount)
+- ideally this "bind-mount and lazy-umount" should be done atomically,
+  but it may be possible from userspace by the mount helper.
+
+By adding the internal hidden vfsmount and using it when opening a
+file, the file path under /proc would be printed correctly. This
+approach looks smarter, but I am afraid it is not possible.
+- aufs-root may be bind-mounted later. When that happens, another
+  hidden vfsmount will be required.
+- it is hard to get the chance to bind-mount and lazy-umount
+  + in kernel-space, FS can have vfsmount in open(2) via
+    file->f_path, and aufs can know its vfsmount. But several locks are
+    already acquired, and if aufs tries to bind-mount and lazy-umount
+    here, then it may cause a deadlock.
+  + in user-space, bind-mount doesn't invoke the mount helper.
+- since /proc shows dev and ino, aufs has to give the vma this info. it
+  means a new member vm_prinode will be necessary. this is essentially
+  equivalent to vm_prfile described above.
+
+I have to give up this "looks-smarter" approach.
diff --git b/Documentation/filesystems/aufs/design/06xattr.txt b/Documentation/filesystems/aufs/design/06xattr.txt
new file mode 100644
index 0000000..7bfa94f
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06xattr.txt
@@ -0,0 +1,81 @@
+
+# Copyright (C) 2014-2016 Junjiro R. Okajima
+
+Listing XATTR/EA and getting the value
+----------------------------------------------------------------------
+For the inode standard attributes (owner, group, timestamps, etc.), aufs
+shows the values from the topmost existing file. This behaviour is good
+for the non-dir entries since the behaviour exactly matches the shown
+information. But for the directories, aufs considers all the same named
+entries on the lower branches. This means that, if one of the lower
+entries rejects readdir, then aufs returns an error even if the topmost
+entry allows it. This behaviour is necessary to respect the branch fs's
+security, but can make users confused since the user-visible standard
+attributes don't match the behaviour.
+To address this issue, aufs has a mount option called dirperm1 which
+checks the permission for the topmost entry only, and ignores the lower
+entry's permission.
+
+A similar issue can happen around XATTR.
+getxattr(2) and listxattr(2) families behave as if the dirperm1 option
+were always set. Otherwise these very unpleasant situations could
+happen.
+- listxattr(2) may return duplicated entries.
+- users may never be able to remove or reset the XATTR.
+
+
+XATTR/EA support in the internal (copy,move)-(up,down)
+----------------------------------------------------------------------
+Generally the extended attributes of an inode are categorized as follows.
+- "security" for LSM and capability.
+- "system" for posix ACL, 'acl' mount option is required for the branch
+  fs generally.
+- "trusted" for userspace, CAP_SYS_ADMIN is required.
+- "user" for userspace, 'user_xattr' mount option is required for the
+  branch fs generally.
+
+Moreover there are some other categories. Aufs handles these rather
+unpopular categories as the ordinary ones, ie. there is no special
+condition nor exception.
+
+In copy-up, the support for XATTR on the dst branch may differ from the
+src branch. In this case, the copy-up operation will get an error and
+the original user operation which triggered the copy-up will fail. It
+can even happen that all copy-ups fail.
+When both of src and dst branches support XATTR and if an error occurs
+during copying XATTR, then the copy-up should fail obviously. That is a
+good reason and aufs should return an error to userspace. But when only
+the src branch supports that XATTR, aufs should not return an error.
+For example, the src branch supports ACL but the dst branch doesn't,
+because the dst branch may not support it natively, or may have it
+temporarily disabled due to the "noacl" mount option. Of course, the
+dst branch fs may NOT return an error even if the XATTR is not
+supported. It is totally up to the branch fs.
+
+Anyway, when the aufs internal copy-up gets an error from the dst
+branch fs, aufs tries removing the just-copied entry and returns the
+error to userspace. In the worst case of this situation, all copy-ups
+will fail.
+
+For the copy-up operation, there are two basic approaches.
+- copy the specified XATTR only (by category above), and return the
+  error unconditionally if it happens.
+- copy all XATTR, and ignore the error on the specified category only.
+
+In order to support XATTR and to implement the correct behaviour, aufs
+chooses the latter approach and introduces some new branch attributes,
+"icexsec", "icexsys", "icextr", "icexusr", and "icexoth".
+They correspond to the XATTR namespaces (see above). Additionally, for
+convenience, "icex" is also provided, which means that all "icex*"
+attributes are set (here the word "icex" stands for "ignore copy-error
+on XATTR").
+
+The meaning of these attributes is to ignore the error from setting
+XATTR on that branch.
+Note that aufs tries copying all XATTR unconditionally, and ignores the
+error from the dst branch according to the specified attributes.
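+
+As a usage sketch (hypothetical paths; please verify the branch
+attribute syntax against the aufs manual), a mount which ignores
+copy-up errors for every XATTR namespace on its writable branch might
+look like:
+# mount -t aufs -o br:/rw=rw+icex:/ro=ro none /mnt/union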
+
+Some XATTRs may have default values. The default value may come from
+the parent dir or the environment. If the default value is set at
+file-creation time, it will be overwritten by copy-up.
+Some contradiction may happen, I am afraid.
+Do we need another attribute to stop copying XATTR? I am unsure. For
+now, aufs implements the branch attributes to ignore the error.
diff --git b/Documentation/filesystems/aufs/design/07export.txt b/Documentation/filesystems/aufs/design/07export.txt
new file mode 100644
index 0000000..c23930b
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/07export.txt
@@ -0,0 +1,45 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Export Aufs via NFS
+----------------------------------------------------------------------
+Here is an approach.
+- like xino/xib, add a new file 'xigen' which stores aufs inode
+  generation.
+- iget_locked(): initialize aufs inode generation for a new inode, and
+  store it in xigen file.
+- destroy_inode(): increment aufs inode generation and store it in xigen
+  file. it is necessary even if the inode is not unlinked, because any
+  data of the inode may be changed by UDBA.
+- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise
+  build file handle by
+  + branch id (4 bytes)
+  + superblock generation (4 bytes)
+  + inode number (4 or 8 bytes)
+  + parent dir inode number (4 or 8 bytes)
+  + inode generation (4 bytes)
+  + return value of exportfs_encode_fh() for the parent on a branch (4
+    bytes)
+  + file handle for a branch (by exportfs_encode_fh())
+- fh_to_dentry():
+  + find the index of a branch from its id in the handle, and check
+    that it still exists in aufs.
+  + 1st level: get the inode number from handle and search it in cache.
+  + 2nd level: if not found in cache, get the parent inode number from
+    the handle and search it in cache. and then open the found parent
+    dir, find the matching inode number by vfs_readdir() and get its
+    name, and call lookup_one_len() for the target dentry.
+  + 3rd level: if the parent dir is not cached, call
+    exportfs_decode_fh() for a branch and get the parent on a branch,
+    build a pathname of it, convert it to a pathname in aufs, call
+    path_lookup(). now aufs gets a parent dir dentry, then handle it as
+    the 2nd level.
+  + to open the dir, aufs needs struct vfsmount. aufs keeps a vfsmount
+    for every branch, but not for itself. to get this, (currently) aufs
+    searches the current->nsproxy->mnt_ns list. it may not be a good
+    idea, but I didn't find another approach.
+  + test the generation of the obtained inode.
+- every inode operation: they may get EBUSY due to UDBA. in this case,
+  convert it into ESTALE for NFSD.
+- readdir(): call lockdep_on/off() because filldir in NFSD calls
+  lookup_one_len(), vfs_getattr(), encode_fh() and others.
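+
+As a usage sketch (hypothetical path, network and fsid; this assumes
+CONFIG_AUFS_EXPORT is enabled and a standard NFS server is in use),
+an /etc/exports entry for an aufs mount usually needs an explicit
+fsid, for example:
+/mnt/union	192.168.0.0/24(rw,fsid=100,no_subtree_check)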
diff --git b/Documentation/filesystems/aufs/design/08shwh.txt b/Documentation/filesystems/aufs/design/08shwh.txt
new file mode 100644
index 0000000..ad58ebe
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/08shwh.txt
@@ -0,0 +1,39 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Show Whiteout Mode (shwh)
+----------------------------------------------------------------------
+Generally aufs hides the names of whiteouts. But in some cases, showing
+them is very useful for users. For instance, when creating a new middle
+layer (branch) by merging existing layers.
+
+(borrowing aufs1 HOW-TO from a user, Michael Towers)
+When you have three branches,
+- Bottom: 'system', squashfs (underlying base system), read-only
+- Middle: 'mods', squashfs, read-only
+- Top: 'overlay', ram (tmpfs), read-write
+
+The top layer is loaded at boot time and saved at shutdown, to preserve
+the changes made to the system during the session.
+When larger changes have been made, or smaller changes have accumulated,
+the size of the saved top layer data grows. At this point, it would be
+nice to be able to merge the two overlay branches ('mods' and 'overlay')
+and rewrite the 'mods' squashfs, clearing the top layer and thus
+restoring save and load speed.
+
+This merging is simplified by the use of another aufs mount, of just the
+two overlay branches using the 'shwh' option.
+# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \
+	aufs /livesys/merge_union
+
+A merged view of these two branches is then available at
+/livesys/merge_union, and the new feature is that the whiteouts are
+visible!
+Note that in 'shwh' mode the aufs mount must be 'ro', which will disable
+writing to all branches. Also the default mode for all branches is 'ro'.
+It is now possible to save the combined contents of the two overlay
+branches to a new squashfs, e.g.:
+# mksquashfs /livesys/merge_union /path/to/newmods.squash
+
+This new squashfs archive can be stored on the boot device and the
+initramfs will use it to replace the old one at the next boot.
diff --git b/Documentation/filesystems/aufs/design/10dynop.txt b/Documentation/filesystems/aufs/design/10dynop.txt
new file mode 100644
index 0000000..49afc58
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/10dynop.txt
@@ -0,0 +1,34 @@
+
+# Copyright (C) 2010-2016 Junjiro R. Okajima
+
+Dynamically customizable FS operations
+----------------------------------------------------------------------
+Generally FS operations (struct inode_operations, struct
+address_space_operations, struct file_operations, etc.) are defined as
+"static const", but that never means an FS has only one set of
+operations. Some FSs have multiple sets of them. For instance, ext2 has
+three sets: one for XIP, one for NOBH, and one for normal use.
+Since aufs overrides and redirects these operations, sometimes aufs has
+to change its behaviour according to the branch FS type. More
+importantly, VFS acts differently depending on whether a function
+(member in the struct) is set or not. It means aufs should have several
+sets of operations and select one
+among them according to the branch FS definition.
+
+In order to solve this problem and not to affect the behaviour of VFS,
+aufs defines these operations dynamically. For instance, aufs defines a
+dummy direct_IO function for struct address_space_operations, but it
+may not actually be set in the address_space_operations. When the
+branch FS doesn't have it, aufs doesn't set it in its own
+address_space_operations while the function definition itself remains.
+So the behaviour of VFS will not change, and it will return an error
+when direct_IO is not set.
+
+The lifetime of these dynamically generated operation objects is
+maintained by the aufs branch object. When the branch is removed from aufs,
+the reference counter of the object is decremented. When it reaches
+zero, the dynamically generated operation object will be freed.
+
+This approach is designed to support AIO (io_submit), Direct I/O and
+XIP (DAX) mainly.
+Currently this approach is applied to address_space_operations for
+regular files only.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 21e4b48..6a263f1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3528,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	sched_debug	[KNL] Enables verbose scheduler debug messages.
 
+	schedstats=	[KNL,X86] Enable or disable scheduler statistics.
+			Allowed values are enable and disable. This feature
+			incurs a small amount of overhead in the scheduler
+			but is useful for debugging and performance tuning.
+
 	skew_tick=	[KNL] Offset the periodic timer tick per cpu to mitigate
 			xtime_lock contention on larger systems, and/or RCU lock
 			contention on all systems with CONFIG_MAXSMP set.
@@ -4056,6 +4061,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 					HIGHMEM regardless of setting
 					of CONFIG_HIGHPTE.
 
+	uuid_debug=	(Boolean) whether to enable debugging of TuxOnIce's
+			uuid support.
+
 	vdso=		[X86,SH]
 			On X86_32, this is an alias for vdso32=.  Otherwise:
 
diff --git b/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt
new file mode 100644
index 0000000..0c6a216
--- /dev/null
+++ b/Documentation/power/tuxonice-internals.txt
@@ -0,0 +1,532 @@
+		   TuxOnIce 4.0 Internal Documentation.
+			Updated to 23 March 2015
+
+(Please note that incremental image support mentioned in this document is work
+in progress. This document may need updating prior to the actual release of
+4.0!)
+
+1.  Introduction.
+
+    TuxOnIce 4.0 is an addition to the Linux Kernel, designed to
+    allow the user to quickly shutdown and quickly boot a computer, without
+    needing to close documents or programs. It is equivalent to the
+    hibernate facility in some laptops. This implementation, however,
+    requires no special BIOS or hardware support.
+
+    The code in these files is based upon the original implementation
+    prepared by Gabor Kuti and additional work by Pavel Machek and a
+    host of others. This code has been substantially reworked by Nigel
+    Cunningham, again with the help and testing of many others, not the
+    least of whom are Bernard Blackham and Michael Frank. At its heart,
+    however, the operation is essentially the same as Gabor's version.
+
+2.  Overview of operation.
+
+    The basic sequence of operations is as follows:
+
+	a. Quiesce all other activity.
+	b. Ensure enough memory and storage space are available, and attempt
+	   to free memory/storage if necessary.
+	c. Allocate the required memory and storage space.
+	d. Write the image.
+	e. Power down.
+
+    There are a number of complicating factors which mean that things are
+    not as simple as the above would imply, however...
+
+    o The activity of each process must be stopped at a point where it will
+    not be holding locks necessary for saving the image, or unexpectedly
+    restart operations due to something like a timeout and thereby make
+    our image inconsistent.
+
+    o It is desirable that we sync outstanding I/O to disk before calculating
+    image statistics. This reduces corruption if one should suspend but
+    then not resume, and also makes later parts of the operation safer (see
+    below).
+
+    o We need to get as close as we can to an atomic copy of the data.
+    Inconsistencies in the image will result in inconsistent memory contents at
+    resume time, and thus in instability of the system and/or file system
+    corruption. This would appear to imply a maximum image size of one half of
+    the amount of RAM, but we have a solution... (again, below).
+
+    o In 2.6 and later, we choose to play nicely with the other suspend-to-disk
+    implementations.
+
+3.  Detailed description of internals.
+
+    a. Quiescing activity.
+
+    Safely quiescing the system is achieved using three separate but related
+    aspects.
+
+    First, we use the vanilla kernel's support for freezing processes. This code
+    is based on the observation that the vast majority of processes don't need
+    to run during suspend. They can be 'frozen'. The kernel therefore
+    implements a refrigerator routine, which processes enter and in which they
+    remain until the cycle is complete. Processes enter the refrigerator via
+    try_to_freeze() invocations at appropriate places.  A process cannot be
+    frozen in any old place. It must not be holding locks that will be needed
+    for writing the image or freezing other processes. For this reason,
+    userspace processes generally enter the refrigerator via the signal
+    handling code, and kernel threads at the place in their event loops where
+    they drop locks and yield to other processes or sleep. The task of freezing
+    processes is complicated by the fact that there can be interdependencies
+    between processes. Freezing process A before process B may mean that
+    process B cannot be frozen, because it stops while waiting for process A
+    rather than in the refrigerator. This issue is seen where userspace waits
+    on freezeable kernel threads or fuse filesystem threads. To address this
+    issue, we implement the following algorithm for quiescing activity:
+
+	- Freeze filesystems (including fuse - userspace programs starting
+		new requests are immediately frozen; programs already running
+		requests complete their work before being frozen in the next
+		step)
+	- Freeze userspace
+	- Thaw filesystems (this is safe now that userspace is frozen and no
+		fuse requests are outstanding).
+	- Invoke sys_sync (noop on fuse).
+	- Freeze filesystems
+	- Freeze kernel threads
+
+    If we need to free memory, we thaw kernel threads and filesystems, but not
+    userspace. We can then free caches without worrying about deadlocks due to
+    swap files being on frozen filesystems or such like.
+
+    b. Ensure enough memory & storage are available.
+
+    We have a number of constraints to meet in order to be able to successfully
+    suspend and resume.
+
+    First, the image will be written in two parts, described below. One of
+    these parts needs to have an atomic copy made, which of course implies a
+    maximum size of one half of the amount of system memory. The other part
+    ('pageset') is not atomically copied, and can therefore be as large or
+    small as desired.
+
+    Second, we have constraints on the amount of storage available. In these
+    calculations, we may also consider any compression that will be done. The
+    cryptoapi module allows the user to configure an expected compression ratio.
+
+    Third, the user can specify an arbitrary limit on the image size, in
+    megabytes. This limit is treated as a soft limit, so that we don't fail the
+    attempt to suspend if we cannot meet this constraint.
+
+    c. Allocate the required memory and storage space.
+
+    Having done the initial freeze, we determine whether the above constraints
+    are met, and seek to allocate the metadata for the image. If the constraints
+    are not met, or we fail to allocate the required space for the metadata, we
+    seek to free the amount of memory that we calculate is needed and try again.
+    We allow up to four iterations of this loop before aborting the cycle. If
+    we do fail, it should only be because of a bug in TuxOnIce's calculations
+    or the vanilla kernel code for freeing memory.
+
+    These steps are merged together in the prepare_image function, found in
+    prepare_image.c. The functions are merged because of the cyclical nature
+    of the problem of calculating how much memory and storage is needed. Since
+    the data structures containing the information about the image must
+    themselves take memory and use storage, the amount of memory and storage
+    required changes as we prepare the image. Since the changes are not large,
+    only one or two iterations will be required to achieve a solution.
+
+    The recursive nature of the algorithm is minimised by keeping user space
+    frozen while preparing the image, and by the fact that our records of which
+    pages are to be saved and which pageset they are saved in use bitmaps (so
+    that changes in number or fragmentation of the pages to be saved don't
+    feedback via changes in the amount of memory needed for metadata). The
+    recursiveness is thus limited to any extra slab pages allocated to store the
+    extents that record storage used, and the effects of seeking to free memory.
+
+    d. Write the image.
+
+    We previously mentioned the need to create an atomic copy of the data, and
+    the half-of-memory limitation that is implied in this. This limitation is
+    circumvented by dividing the memory to be saved into two parts, called
+    pagesets.
+
+    Pageset2 contains most of the page cache - the pages on the active and
+    inactive LRU lists that aren't needed or modified while TuxOnIce is
+    running, so they can be safely written without an atomic copy. They are
+    therefore saved first and reloaded last. While saving these pages,
+    TuxOnIce carefully ensures that the work of writing the pages doesn't make
+    the image inconsistent. With the support for Kernel (Video) Mode Setting
+    going into the kernel at the time of writing, we need to check for pages
+    on the LRU that are used by KMS, and exclude them from pageset2. They are
+    atomically copied as part of pageset 1.
+
+    Once pageset2 has been saved, we prepare to do the atomic copy of remaining
+    memory. As part of the preparation, we power down drivers, thereby providing
+    them with the opportunity to have their state recorded in the image. The
+    amount of memory allocated by drivers for this is usually negligible, but if
+    DRI is in use, video drivers may require significant amounts. Ideally we
+    would be able to query drivers while preparing the image as to the amount of
+    memory they will need. Unfortunately no such mechanism exists at the time of
+    writing. For this reason, TuxOnIce allows the user to set an
+    'extra_pages_allowance', which is used to seek to ensure sufficient memory
+    is available for drivers at this point. TuxOnIce also lets the user set this
+    value to 0. In this case, a test driver suspend is done while preparing the
+    image, and the difference (plus a margin) is used instead. TuxOnIce will also
+    automatically restart the hibernation process (twice at most) if it finds
+    that the extra pages allowance is not sufficient. It will then use what was
+    actually needed (plus a margin, again). Failure to hibernate should thus
+    be an extremely rare occurrence.
+
+    Having suspended the drivers, we save the CPU context before making an
+    atomic copy of pageset1, resuming the drivers and saving the atomic copy.
+    After saving the two pagesets, we just need to save our metadata before
+    powering down.
+
+    As we mentioned earlier, the contents of pageset2 pages aren't needed once
+    they've been saved. We therefore use them as the destination of our atomic
+    copy. In the unlikely event that pageset1 is larger, extra pages are
+    allocated while the image is being prepared. This is normally only a real
+    possibility when the system has just been booted and the page cache is
+    small.
+
+    This is where we need to be careful about syncing, however. Pageset2 will
+    probably contain filesystem meta data. If this is overwritten with pageset1
+    and then a sync occurs, the filesystem will be corrupted - at least until
+    resume time and another sync of the restored data. Since there is a
+    possibility that the user might not resume or (may it never be!) that
+    TuxOnIce might oops, we do our utmost to avoid syncing filesystems after
+    copying pageset1.
+
+    e. Incremental images
+
+    TuxOnIce 4.0 introduces a new incremental image mode which changes things a
+    little. When incremental images are enabled, we save a 'normal' image the
+    first time we hibernate. On resume, however, we do not free the image or
+    the associated storage. Instead, it is retained until the next attempt at
+    hibernating and a mechanism is enabled which is used to track which pages
+    of memory are modified between the two cycles. The modified pages can then
+    be added to the existing image, rather than unmodified pages being saved
+    again unnecessarily.
+
+    Incremental image support is available in 64 bit Linux only, due to the
+    requirement for extra page flags.
+
+    This support is accomplished in the following way:
+
+    1) Tracking of pages.
+
+    The tracking of changed pages is accomplished using the page fault
+    mechanism. When we reach a point at which we want to start tracking
+    changes, most pages are marked read-only and also flagged as being
+    read-only because of this support. Since this cannot happen for every page
+    of RAM, some are marked as untracked and always treated as modified when
+    preparing an incremental image. When a process attempts to modify a page
+    that is marked read-only in this way, a page fault occurs, with TuxOnIce
+    code marking the page writable and dirty before allowing the write to
+    continue. In this way, the effect of incremental images on performance is
+    minimised - a page only causes a fault once. Small modifications to the
+    page allocator further reduce the number of faults that occur - free pages
+    are not tracked; they are made writable and marked as dirty as part of
+    being allocated.
+
+    2) Saving the incremental image / atomicity.
+
+    The page fault mechanism is also used to improve the means by which
+    atomicity of the image is achieved. When it is time to do an atomic copy,
+    the flags for pages are reset, with the result being that it is no longer
+    necessary for us to do an atomic copy of pageset1. Instead, we normally write
+    the uncopied pages to disk. When an attempt is made to modify a page that
+    has not yet been saved, the page-fault mechanism makes a copy of the page
+    prior to allowing the write. This copy is then written to disk. Likewise,
+    on resume, if a process attempts to write to a page that has been read
+    while the rest of the image is still being loaded, a copy of that page is
+    made prior to the write being allowed. At the end of loading the image,
+    modified pages can thus be restored to their 'atomic copy' contents prior
+    to restarting normal operation. We also mark pages that are yet to be read
+    as invalid PFNs, so that we can capture as a bug any attempt by a
+    half-restored kernel to access a page that hasn't yet been reloaded.
+
+    f. Power down.
+
+    Powering down uses standard kernel routines. TuxOnIce supports powering down
+    using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off.
+    Supporting suspend to ram (S3) as a power off option might sound strange,
+    but it allows the user to quickly get their system up and running again if
+    the battery doesn't run out (we just need to re-read the overwritten pages)
+    and if the battery does run out (or the user removes power), they can still
+    resume.
+
+4.  Data Structures.
+
+    TuxOnIce uses three main structures to store its metadata and configuration
+    information:
+
+    a) Pageflags bitmaps.
+
+    TuxOnIce records which pages will be in pageset1, pageset2, the destination
+    of the atomic copy and the source of the atomically restored image using
+    bitmaps. The code used is that written for swsusp, with small improvements
+    to match TuxOnIce's requirements.
+
+    The pageset1 bitmap is thus easily stored in the image header for use at
+    resume time.
+
+    As mentioned above, using bitmaps also means that the amount of memory and
+    storage required for recording the above information is constant. This
+    greatly simplifies the work of preparing the image. In earlier versions of
+    TuxOnIce, extents were used to record which pages would be stored. In that
+    case, however, eating memory could result in greater fragmentation of the
+    lists of pages, which in turn required more memory to store the extents and
+    more storage in the image header. These could in turn require further
+    freeing of memory, and another iteration. All of this complexity is removed
+    by having bitmaps.
+
+    Bitmaps also make a lot of sense because TuxOnIce only ever iterates
+    through the lists. There is therefore no cost to not being able to find the
+    nth page in order 0 time. We only need to worry about the cost of finding
+    the n+1th page, given the location of the nth page. Bitwise optimisations
+    help here.
+
+    b) Extents for block data.
+
+    TuxOnIce supports writing the image to multiple block devices. In the case
+    of swap, multiple partitions and/or files may be in use, and we happily use
+    them all (with the exception of compcache pages, which we allocate but do
+    not use). This use of multiple block devices is accomplished as follows:
+
+    Whatever the actual source of the allocated storage, the destination of the
+    image can be viewed in terms of one or more block devices, and on each
+    device, a list of sectors. To simplify matters, we only use contiguous,
+    PAGE_SIZE aligned sectors, like the swap code does.
+
+    Since sector numbers on each bdev may well not start at 0, it makes much
+    more sense to use extents here. Contiguous ranges of pages can thus be
+    represented in the extents by contiguous values.
+
+    Variations in block size are taken into account when transforming this data
+    into the parameters for bio submission.
+
+    We can thus implement a layer of abstraction wherein the core of TuxOnIce
+    doesn't have to worry about which device we're currently writing to or
+    where in the device we are. It simply requests that the next page in the
+    pageset or header be written, leaving the details to this lower layer.
+    The lower layer remembers where in the sequence of devices and blocks each
+    pageset starts. The header always starts at the beginning of the allocated
+    storage.
+
+    So extents are:
+
+    struct extent {
+      unsigned long minimum, maximum;
+      struct extent *next;
+    }
+
+    These are combined into chains of extents for a device:
+
+    struct extent_chain {
+      int size; /* size of the chain, ie. sum of (max-min+1) */
+      int allocs, frees;
+      char *name;
+      struct extent *first, *last_touched;
+    };
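+
+    For example (illustrative numbers only), storage allocated at
+    sectors 16-23 and 64-79 of a device would be recorded as two
+    extents, {16, 23} and {64, 79}, giving a chain size of 8 + 16 = 24.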
+
+    For each bdev, we need to store a little more info (simplified definition):
+
+    struct toi_bdev_info {
+       struct block_device *bdev;
+
+       char uuid[17];
+       dev_t dev_t;
+       int bmap_shift;
+       int blocks_per_page;
+    };
+
+    The uuid is the main means used to identify the device in the storage
+    image. This means we can cope with the dev_t representation of a device
+    changing between saving the image and restoring it, as may happen on some
+    BIOSes or in the LVM case.
+
+    bmap_shift and blocks_per_page apply the effects of variations in blocks
+    per page settings for the filesystem and underlying bdev. For most
+    filesystems, these are the same, but for xfs, they can have independent
+    values.
+
+    Combining these two structures together, we have everything we need to
+    record what devices and what blocks on each device are being used to
+    store the image, and to submit i/o using bio_submit.
+
+    The last elements in the picture are a means of recording how the storage
+    is being used.
+
+    We do this first and foremost by implementing a layer of abstraction on
+    top of the devices and extent chains which allows us to view however many
+    devices there might be as one long storage tape, with a single 'head' that
+    tracks a 'current position' on the tape:
+
+    struct extent_iterate_state {
+      struct extent_chain *chains;
+      int num_chains;
+      int current_chain;
+      struct extent *current_extent;
+      unsigned long current_offset;
+    };
+
+    That is, *chains points to an array of size num_chains of extent chains.
+    For the filewriter, this is always a single chain. For the swapwriter, the
+    array is of size MAX_SWAPFILES.
+
+    current_chain, current_extent and current_offset thus point to the current
+    index in the chains array (and into a matching array of struct
+    toi_bdev_info), the current extent in that chain (to optimise access),
+    and the current value in the offset.
+
+    The image is divided into three parts:
+    - The header
+    - Pageset 1
+    - Pageset 2
+
+    The header always starts at the first device and first block. We know its
+    size before we begin to save the image because we carefully account for
+    everything that will be stored in it.
+
+    The second pageset (LRU) is stored first. It begins on the next page after
+    the end of the header.
+
+    The first pageset is stored second. Its start location is only known once
+    pageset2 has been saved, since pageset2 may be compressed as it is written.
+    This location is thus recorded at the end of saving pageset2. It is
+    also page aligned.
+
+    Since this information is needed at resume time, and the location of extents
+    in memory will differ at resume time, this needs to be stored in a portable
+    way:
+
+    struct extent_iterate_saved_state {
+        int chain_num;
+        int extent_num;
+        unsigned long offset;
+    };
+
+    We can thus implement a layer of abstraction wherein the core of TuxOnIce
+    doesn't have to worry about which device we're currently writing to or
+    where in the device we are. It simply requests that the next page in the
+    pageset or header be written, leaving the details to this layer, and
+    invokes the routines to remember and restore the position, without having
+    to worry about the details of how the data is arranged on disk or such like.
+
+    c) Modules
+
+    One aim in designing TuxOnIce was to make it flexible. We wanted to allow
+    for the implementation of different methods of transforming a page to be
+    written to disk and different methods of getting the pages stored.
+
+    In early versions (the betas and perhaps Suspend1), compression support was
+    inlined in the image writing code, and the data structures and code for
+    managing swap were intertwined with the rest of the code. A number of people
+    had expressed interest in implementing image encryption, and alternative
+    methods of storing the image.
+
+    In order to achieve this, TuxOnIce was given a modular design.
+
+    A module is a single file which encapsulates the functionality needed
+    to transform a pageset of data (encryption or compression, for example),
+    or to write the pageset to a device. The former type of module is called
+    a 'page-transformer', the latter a 'writer'.
+
+    Modules are linked together in pipeline fashion. There may be zero or more
+    page transformers in a pipeline, and there is always exactly one writer.
+    The pipeline follows this pattern:
+
+		---------------------------------
+		|          TuxOnIce Core        |
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|	Page transformer 1	|
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|	Page transformer 2	|
+		---------------------------------
+				|
+				|
+		---------------------------------
+		|            Writer		|
+		---------------------------------
+
+    During the writing of an image, the core code feeds pages one at a time
+    to the first module. This module performs whatever transformations it
+    implements on the incoming data, completely consuming the incoming data and
+    feeding output in a similar manner to the next module.
+
+    All routines are SMP safe, and the final result of the transformations is
+    written with an index (provided by the core) and size of the output by the
+    writer. As a result, we can have multithreaded I/O without needing to
+    worry about the sequence in which pages are written (or read).
+
+    During reading, the pipeline works in the reverse direction. The core code
+    calls the first module with the address of a buffer which should be filled.
+    (Note that the buffer size is always PAGE_SIZE at this time). This module
+    will in turn request data from the next module and so on down until the
+    writer is made to read from the stored image.
+
+    Part of the definition of a module's structure thus looks like this:
+
+        int (*rw_init) (int rw, int stream_number);
+        int (*rw_cleanup) (int rw);
+        int (*write_chunk) (struct page *buffer_page);
+        int (*read_chunk) (struct page *buffer_page, int sync);
+
+    It should be noted that the _cleanup routine may be called before the
+    full stream of data has been read or written. While writing the image,
+    the user may (depending upon settings) choose to abort suspending, and
+    if we are in the midst of writing the last portion of the image, a portion
+    of the second pageset may be reread. This may also happen if an error
+    occurs and we seek to abort the process of writing the image.
+
+    The modular design is also useful in a number of other ways. It provides
+    a means whereby we can add support for:
+
+    - providing overall initialisation and cleanup routines;
+    - serialising configuration information in the image header;
+    - providing debugging information to the user;
+    - determining memory and image storage requirements;
+    - dis/enabling components at run-time;
+    - configuring the module (see below);
+
+    ...and routines for writers specific to their work:
+    - Parsing a resume= location;
+    - Determining whether an image exists;
+    - Marking a resume as having been attempted;
+    - Invalidating an image;
+
+    Since some parts of the core - the user interface and storage manager
+    support - have use for some of these functions, they are registered as
+    'miscellaneous' modules as well.
+
+    d) Sysfs data structures.
+
+    This brings us naturally to support for configuring TuxOnIce. We desired to
+    provide a way to make TuxOnIce as flexible and configurable as possible.
+    The user shouldn't have to reboot just because they now want to hibernate to
+    a file instead of a partition, for example.
+
+    To accomplish this, TuxOnIce implements a very generic means whereby the
+    core and modules can register new sysfs entries. All TuxOnIce entries use
+    a single _store and _show routine, both of which are found in
+    tuxonice_sysfs.c in the kernel/power directory. These routines handle the
+    most common operations - getting and setting the values of bits, integers,
+    longs, unsigned longs and strings in one place, and allow overrides for
+    customised get and set options as well as side-effect routines for all
+    reads and writes.
+
+    When combined with some simple macros, a new sysfs entry can then be defined
+    in just a couple of lines:
+
+        SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
+                        2048, 0, NULL),
+
+    This defines a sysfs entry named "progress_granularity" which is rw and
+    allows the user to access an integer stored at &progress_granularity, giving
+    it a value between 1 and 2048 inclusive.
+
+    Sysfs entries are registered under /sys/power/tuxonice, and entries for
+    modules are located in a subdirectory named after the module.
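+
+    From userspace these entries are ordinary sysfs files, so reading
+    and writing them with cat and echo is all that is needed, for
+    example (paths as used elsewhere in this documentation):
+
+        # cat /sys/power/tuxonice/resume
+        # echo > /sys/power/tuxonice/do_hibernate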
+
diff --git b/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt
new file mode 100644
index 0000000..3bf0575
--- /dev/null
+++ b/Documentation/power/tuxonice.txt
@@ -0,0 +1,948 @@
+	--- TuxOnIce, version 3.0 ---
+
+1.  What is it?
+2.  Why would you want it?
+3.  What do you need to use it?
+4.  Why not just use the version already in the kernel?
+5.  How do you use it?
+6.  What do all those entries in /sys/power/tuxonice do?
+7.  How do you get support?
+8.  I think I've found a bug. What should I do?
+9.  When will XXX be supported?
+10. How does it work?
+11. Who wrote TuxOnIce?
+
+1. What is it?
+
+   Imagine you're sitting at your computer, working away. For some reason, you
+   need to turn off your computer for a while - perhaps it's time to go home
+   for the day. When you come back to your computer next, you're going to want
+   to carry on where you left off. Now imagine that you could push a button and
+   have your computer store the contents of its memory to disk and power down.
+   Then, when you next start up your computer, it loads that image back into
+   memory and you can carry on from where you were, just as if you'd never
+   turned the computer off. You have far less time to start up, no reopening of
+   applications or finding what directory you put that file in yesterday.
+   That's what TuxOnIce does.
+
+   TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who,
+   with some help from Pavel Machek, got an early version going in 1999. The
+   project was then taken over by Florent Chabaud while still in alpha version
+   numbers. Nigel Cunningham came on the scene when Florent was unable to
+   continue, moving the project into betas, then 1.0, 2.0 and so on up to
+   the present series. During the 2.0 series, the name was contracted to
+   Suspend2 and the website suspend2.net was created. Beginning around July 2007,
+   a transition to calling the software TuxOnIce was made, to seek to help
+   make it clear that TuxOnIce is more concerned with hibernation than suspend
+   to ram.
+
+   Pavel Machek's swsusp code, which was merged around 2.5.17, retains the
+   original name, and was essentially a fork of the beta code until Rafael
+   Wysocki came on the scene in 2005 and began to improve it further.
+
+2. Why would you want it?
+
+   Why wouldn't you want it?
+
+   Being able to save the state of your system and quickly restore it improves
+   your productivity - you get a useful system in far less time than through
+   the normal boot process. You also get to be completely 'green', using zero
+   power, or as close to that as possible (the computer may still provide
+   minimal power to some devices, so they can initiate a power on, but that
+   will be the same amount of power as would be used if you told the computer
+   to shut down).
+
+3. What do you need to use it?
+
+   a. Kernel Support.
+
+   i) The TuxOnIce patch.
+
+   TuxOnIce is part of the Linux Kernel. This version is not part of Linus's
+   2.6 tree at the moment, so you will need to download the kernel source and
+   apply the latest patch. Having done that, enable the appropriate options in
+   make [menu|x]config (under Power Management Options - look for "Enhanced
+   Hibernation"), compile and install your kernel. TuxOnIce works with SMP,
+   Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64.
+
+   TuxOnIce patches are available from http://tuxonice.net.
+
+   ii) Compression support.
+
+   Compression support is implemented via the cryptoapi. You will therefore want
+   to select any Cryptoapi transforms that you want to use on your image from
+   the Cryptoapi menu while configuring your kernel. We recommend the use of the
+   LZO compression method - it is very fast and still achieves good compression.
+
+   You can also tell TuxOnIce to write its image to an encrypted and/or
+   compressed filesystem/swap partition. In that case, you don't need to do
+   anything special for TuxOnIce when it comes to kernel configuration.
+
+   iii) Configuring other options.
+
+   While you're configuring your kernel, try to configure as much as possible
+   to build as modules. We recommend this because there are a number of drivers
+   that are still in the process of implementing proper power management
+   support. In those cases, the best way to work around their current lack is
+   to build them as modules and remove the modules while hibernating. You might
+   also bug the driver authors to get their support up to speed, or even help!
+
+   b. Storage.
+
+   i) Swap.
+
+   TuxOnIce can store the hibernation image in your swap partition, a swap file or
+   a combination thereof. Whichever combination you choose, you will probably
+   want to create enough swap space to store the largest image you could have,
+   plus the space you'd normally use for swap. A good rule of thumb would be
+   to calculate the amount of swap you'd want without using TuxOnIce, and then
+   add the amount of memory you have. This swapspace can be arranged in any way
+   you'd like. It can be in one partition or file, or spread over a number. The
+   only requirement is that they be active when you start a hibernation cycle.
+
+   There is one exception to this requirement. TuxOnIce has the ability to turn
+   on one swap file or partition at the start of hibernating and turn it back off
+   at the end. If you want to ensure you have enough memory to store an image
+   when your memory is fully used, you might want to make one swap partition or
+   file for 'normal' use, and another for TuxOnIce to activate & deactivate
+   automatically. (Further details below).
+
+   ii) Normal files.
+
+   TuxOnIce includes a 'file allocator'. The file allocator can store your
+   image in a simple file. Since Linux has the concept of everything being a
+   file, this is more powerful than it initially sounds. If, for example, you
+   were to set up a network block device file, you could hibernate to a network
+   server. This has been tested and works to a point, but nbd itself isn't
+   stateless enough for our purposes.
+
+   Take extra care when setting up the file allocator. If you just type
+   commands without thinking and then try to hibernate, you could cause
+   irreversible corruption on your filesystems! Make sure you have backups.
+
+   Most people will only want to hibernate to a local file. To achieve that, do
+   something along the lines of:
+
+   echo "TuxOnIce" > /hibernation-file
+   dd if=/dev/zero bs=1M count=512 >> /hibernation-file
+
+   This will create a 512MB file called /hibernation-file. To get TuxOnIce to use
+   it:
+
+   echo /hibernation-file > /sys/power/tuxonice/file/target
+
+   Then
+
+   cat /sys/power/tuxonice/resume
+
+   Put the results of this into your bootloader's configuration (see also step
+   C, below):
+
+   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+   # cat /sys/power/tuxonice/resume
+   file:/dev/hda2:0x1e001
+
+   In this example, we would edit the append= line of our lilo.conf|menu.lst
+   so that it included:
+
+   resume=file:/dev/hda2:0x1e001
+   ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+
+   For those who are thinking 'Could I make the file sparse?', the answer is
+   'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in
+   a sparse file while hibernating. In the longer term (post merge!), I'd like
+   to change things so that the file could be dynamically resized and have
+   holes filled as needed. Right now, however, that's not possible and not a
+   priority.
+
+   c. Bootloader configuration.
+
+   Using TuxOnIce also requires that you add an extra parameter to
+   your lilo.conf or equivalent. Here's an example for a swap partition:
+
+   append="resume=swap:/dev/hda1"
+
+   This would tell TuxOnIce that /dev/hda1 is a swap partition you
+   have. TuxOnIce will use the swap signature of this partition as a
+   pointer to your data when you hibernate. This means that (in this example)
+   /dev/hda1 doesn't need to be _the_ swap partition where all of your data
+   is actually stored. It just needs to be a swap partition that has a
+   valid signature.
+
+   You don't need to have a swap partition for this purpose. TuxOnIce
+   can also use a swap file, but usage is a little more complex. Having made
+   your swap file, turn it on and do
+
+   cat /sys/power/tuxonice/swap/headerlocations
+
+   (this assumes you've already compiled your kernel with TuxOnIce
+   support and booted it). The results of the cat command will tell you
+   what you need to put in lilo.conf:
+
+   For swap partitions like /dev/hda1, simply use resume=/dev/hda1.
+   For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d.
+
+   If the swapfile changes for any reason (it is moved to a different
+   location, it is deleted and recreated, or the filesystem is
+   defragmented) then you will have to check
+   /sys/power/tuxonice/swap/headerlocations for a new resume_block value.
+
+   Once you've compiled and installed the kernel and adjusted your bootloader
+   configuration, you should only need to reboot for the most basic part
+   of TuxOnIce to be ready.
+
+   If you only compile in the swap allocator, or only compile in the file
+   allocator, you don't need to add the "swap:" part of the resume=
+   parameters above. resume=/dev/hda2:0x242d will work just as well. If you
+   have compiled both and your storage is on swap, you can also use this
+   format (the swap allocator is the default allocator).
+
+   When compiling your kernel, one of the options in the 'Power Management
+   Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is
+   called 'Default resume partition'. This can be used to set a default value
+   for the resume= parameter.
+
+   d. The hibernate script.
+
+   Since the driver model in 2.6 kernels is still being developed, you may need
+   to do more than just configure TuxOnIce. Users of TuxOnIce usually start the
+   process via a script which prepares for the hibernation cycle, tells the
+   kernel to do its stuff and then restore things afterwards. This script might
+   involve:
+
+   - Switching to a text console and back if X doesn't like the video card
+     status on resume.
+   - Un/reloading drivers that don't play well with hibernation.
+
+   Note that you might not be able to unload some drivers if there are
+   processes using them. You might have to kill off processes that hold
+   devices open. Hint: if your X server accesses a USB mouse, doing a
+   'chvt' to a text console releases the device and you can unload the
+   module.
+
+   Check out the latest script (available on tuxonice.net).
+
+   e. The userspace user interface.
+
+   TuxOnIce has very limited support for displaying status if you only apply
+   the kernel patch - it can printk messages, but that is all. In addition,
+   some of the functions mentioned in this document (such as cancelling a cycle
+   or performing interactive debugging) are unavailable. To utilise these
+   functions, or simply get a nice display, you need the 'userui' component.
+   Userui comes in three flavours, usplash, fbsplash and text. Text should
+   work on any console. Usplash and fbsplash require the appropriate
+   (distro specific?) support.
+
+   To utilise a userui, TuxOnIce just needs to be told where to find the
+   userspace binary:
+
+   echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program
+
+   The hibernate script can do this for you, and a default value for this
+   setting can be configured when compiling the kernel. This path is also
+   stored in the image header, so if you have an initrd or initramfs, you can
+   use the userui during the first part of resuming (prior to the atomic
+   restore) by putting the binary in the same path in your initrd/ramfs.
+   Alternatively, you can put it in a different location and do an echo
+   similar to the above prior to the echo > do_resume. The value saved in the
+   image header will then be ignored.
+
+4. Why not just use the version already in the kernel?
+
+   The version in the vanilla kernel has a number of drawbacks. The most
+   serious of these are:
+	- it has a maximum image size of 1/2 total memory;
+	- it doesn't allocate storage until after it has snapshotted memory.
+	  This means that you can't be sure hibernating will work until you
+	  see it start to write the image;
+	- it does not allow you to press escape to cancel a cycle;
+	- it does not allow you to press escape to cancel resuming;
+	- it does not allow you to automatically swapon a file when
+	  starting a cycle;
+	- it does not allow you to use multiple swap partitions or files;
+	- it does not allow you to use ordinary files;
+	- it just invalidates an image and continues to boot if you
+	  accidentally boot the wrong kernel after hibernating;
+	- it doesn't support any sort of nice display while hibernating;
+	- it is moving toward requiring that you have an initrd/initramfs
+	  to ever have a hope of resuming (uswsusp). While uswsusp will
+	  address some of the concerns above, it won't address all of them,
+	  and will be more complicated to get set up;
+	- it doesn't have support for suspend-to-both (write a hibernation
+	  image, then suspend to ram; I think this is known as ReadySafe
+	  under M$).
+
+5. How do you use it?
+
+   A hibernation cycle can be started directly by doing:
+
+	echo > /sys/power/tuxonice/do_hibernate
+
+   In practice, though, you'll probably want to use the hibernate script
+   to unload modules, configure the kernel the way you like it and so on.
+   In that case, you'd do (as root):
+
+	hibernate
+
+   See the hibernate script's man page for more details on the options it
+   takes.
+
+   If you're using the text or splash user interface modules, one feature of
+   TuxOnIce that you might find useful is that you can press Escape at any time
+   during hibernating, and the process will be aborted.
+
+   Due to the way hibernation works, this means you'll have your system back and
+   perfectly usable almost instantly. The only exception is when it's at the
+   very end of writing the image. Then it will need to reload a small (usually
+   4-50MB, depending upon the image characteristics) portion first.
+
+   Likewise, when resuming, you can press escape and resuming will be aborted.
+   The computer will then power down or reboot again, according to the
+   powerdown method and reboot settings in effect at that time.
+
+   You can change the settings for powering down while the image is being
+   written by pressing 'R' to toggle rebooting and 'O' to toggle between
+   suspending to ram and powering down completely.
+
+   If you run into problems with resuming, adding the "noresume" option to
+   the kernel command line will let you skip the resume step and recover your
+   system. This option shouldn't normally be needed, because TuxOnIce modifies
+   the image header prior to the atomic restore, and will thus prompt you
+   if it detects that you've tried to resume an image before (this flag is
+   removed if you press Escape to cancel a resume, so you won't be prompted
+   then).
+
+   Recent kernels (2.6.24 onwards) add support for resuming from a different
+   kernel to the one that was hibernated (thanks to Rafael for his work on
+   this - I've just embraced and enhanced the support for TuxOnIce). This
+   should further reduce the need for you to use the noresume option.
+
+6. What do all those entries in /sys/power/tuxonice do?
+
+   /sys/power/tuxonice is the directory which contains files you can use to
+   tune and configure TuxOnIce to your liking. The exact contents of
+   the directory will depend upon the version of TuxOnIce you're
+   running and the options you selected at compile time. In the following
+   descriptions, names in brackets refer to compile time options.
+   (Note that they're all dependent upon you having selected CONFIG_TUXONICE
+   in the first place!).
+
+   Since the values of these settings can open potential security risks, the
+   writeable ones are accessible only to the root user. You may want to
+   configure sudo to allow you to invoke your hibernate script as an ordinary
+   user.
+
+   - alloc/failure_test
+
+   This debugging option provides a way of testing TuxOnIce's handling of
+   memory allocation failures. Each allocation type that TuxOnIce makes has
+   been given a unique number (see the source code). Echo the appropriate
+   number into this entry, and when TuxOnIce attempts to do that allocation,
+   it will pretend there was a failure and act accordingly.
+
+   - alloc/find_max_mem_allocated
+
+   This debugging option will cause TuxOnIce to find the maximum amount of
+   memory it used during a cycle, and report that information in debugging
+   information at the end of the cycle.
+
+   - alt_resume_param
+
+   Instead of powering down after writing a hibernation image, TuxOnIce
+   supports resuming from a different image. This entry lets you set the
+   location of the signature for that image (the resume= value you'd use
+   for it). Combined with keep_image mode, an alternate image can be used,
+   for example, to power down an uninterruptible power supply.
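+
+   For example (the device is purely illustrative):
+
+   echo swap:/dev/hda1 > /sys/power/tuxonice/alt_resume_param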
+
+   - block_io/target_outstanding_io
+
+   This value controls the amount of memory that the block I/O code says it
+   needs when the core code is calculating how much memory is needed for
+   hibernating and for resuming. It doesn't directly control the amount of
+   I/O that is submitted at any one time - that depends on the amount of
+   available memory (we may have more available than we asked for), the
+   throughput that is being achieved and the ability of the CPU to keep up
+   with disk throughput (particularly where we're compressing pages).
+
+   - checksum/enabled
+
+   Use cryptoapi hashing routines to verify that Pageset2 pages don't change
+   while we're saving the first part of the image, and to get any pages that
+   do change resaved in the atomic copy. This should normally not be needed,
+   but if you're seeing issues, please enable this. If your issues prevent
+   you from resuming, enable this option, hibernate and cancel the cycle
+   after the atomic copy is done. If the debugging info shows a non-zero
+   number of pages resaved, please report this to Nigel.
+
+   - compression/algorithm
+
+   Set the cryptoapi algorithm used for compressing the image.
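+
+   For example, assuming the lzo compressor is built into your kernel's
+   cryptoapi:
+
+   echo lzo > /sys/power/tuxonice/compression/algorithm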
+
+   - compression/expected_compression
+
+   These values allow you to set an expected compression ratio, which TuxOnIce
+   will use in calculating whether it meets constraints on the image size. If
+   this expected compression ratio is not attained, the hibernation cycle will
+   abort, so it is wise to allow some spare. You can see what compression
+   ratio is achieved in the logs after hibernating.
+
+   - debug_info:
+
+   This file returns information about your configuration that may be helpful
+   in diagnosing problems with hibernating.
+
+   - did_suspend_to_both:
+
+   This file can be used when you hibernate with powerdown method 3 (ie suspend
+   to ram after writing the image). There can be two outcomes in this case. We
+   can resume from the suspend-to-ram before the battery runs out, or we can run
+   out of juice and end up resuming like normal. This entry lets you find out,
+   post resume, which way we went. If the value is 1, we resumed from suspend
+   to ram. This can be useful when actions need to be run post suspend-to-ram
+   that don't need to be run if we did the normal resume from power off.
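+
+   For example, a post-resume script might check it like this:
+
+   if [ "$(cat /sys/power/tuxonice/did_suspend_to_both)" = "1" ]; then
+       # we came back via suspend-to-ram; extra actions go here
+       :
+   fi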
+
+   - do_hibernate:
+
+   When anything is written to this file, the kernel side of TuxOnIce will
+   begin to attempt to write an image to disk and power down. You'll normally
+   want to run the hibernate script instead, to get modules unloaded first.
+
+   - do_resume:
+
+   When anything is written to this file TuxOnIce will attempt to read and
+   restore an image. If there is no image, it will return almost immediately.
+   If an image exists, the echo > will never return. Instead, the original
+   kernel context will be restored and the original echo > do_hibernate will
+   return.
+
+   - */enabled
+
+   These options can be used to temporarily disable various parts of TuxOnIce.
+
+   - extra_pages_allowance
+
+   When TuxOnIce does its atomic copy, it calls the driver model suspend
+   and resume methods. If you have DRI enabled with a driver such as fglrx,
+   this can result in the driver allocating a substantial amount of memory
+   for storing its state. Extra_pages_allowance tells TuxOnIce how much
+   extra memory it should ensure is available for those allocations. If
+   your attempts at hibernating end with a message in dmesg indicating that
+   insufficient extra pages were allowed, you need to increase this value.
+
+   - file/target:
+
+   Read this value to get the current setting. Write to it to point TuxOnIce
+   at a new storage location for the file allocator. See section 3.b.ii above
+   for details of how to set up the file allocator.
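+
+   For example (the path is purely illustrative; see section 3.b.ii for how
+   to set up the file):
+
+   echo /hibernation-file > /sys/power/tuxonice/file/target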
+
+   - freezer_test
+
+   This entry can be used to get TuxOnIce to just test the freezer and prepare
+   an image without actually doing a hibernation cycle. It is useful for
+   diagnosing freezing and image preparation issues.
+
+   - full_pageset2
+
+   TuxOnIce divides the pages that are stored in an image into two sets. The
+   difference between the two sets is that pages in pageset 1 are atomically
+   copied, and pages in pageset 2 are written to disk without being copied
+   first. A page CAN be written to disk without being copied first if and only
+   if its contents will not be modified or used at any time after userspace
+   processes are frozen. A page MUST be in pageset 1 if its contents are
+   modified or used at any time after userspace processes have been frozen.
+
+   Normally (ie if this option is enabled), TuxOnIce will put all pages on the
+   per-zone LRUs in pageset2, then remove those pages used by any userspace
+   user interface helper and TuxOnIce storage manager that are running,
+   together with pages used by the GEM memory manager introduced around 2.6.28
+   kernels.
+
+   If this option is disabled, a much more conservative approach will be taken.
+   The only pages in pageset2 will be those belonging to userspace processes,
+   with the exclusion of those belonging to the TuxOnIce userspace helpers
+   mentioned above. This will result in a much smaller pageset2, and will
+   therefore result in smaller maximum image sizes than are possible with
+   this option enabled.
+
+   - ignore_rootfs
+
+   TuxOnIce records which device is mounted as the root filesystem when
+   writing the hibernation image. It will normally check at resume time that
+   this device isn't already mounted - that would be a cause of filesystem
+   corruption. In some particular cases (RAM based root filesystems), you
+   might want to disable this check. This option allows you to do that.
+
+   - image_exists:
+
+   Can be used in a script to determine whether a valid image exists at the
+   location currently pointed to by resume=. Returns up to three lines.
+   The first is whether an image exists (-1 for unsure, otherwise 0 or 1).
+   If an image exists, additional lines will return the machine and version.
+   Echoing anything to this entry removes any current image.
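+
+   For example:
+
+   cat /sys/power/tuxonice/image_exists        # first line is -1, 0 or 1
+   echo 0 > /sys/power/tuxonice/image_exists   # discard any current image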
+
+   - image_size_limit:
+
+   The maximum size of hibernation image written to disk, measured in megabytes
+   (1024*1024).
+
+   - last_result:
+
+   The result of the last hibernation cycle, as defined in
+   include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
+   SUSPEND_KEPT_IMAGE. This is a bitmask.
+
+   - late_cpu_hotplug:
+
+   This sysfs entry controls whether cpu hotplugging is done - as normal - just
+   before (unplug) and after (replug) the atomic copy/restore (so that all
+   CPUs/cores are available for multithreaded I/O). The alternative is to
+   unplug all secondary CPUs/cores at the start of hibernating/resuming, and
+   replug them at the end of resuming. No multithreaded I/O will be possible in
+   this configuration, but the odd machine has been reported to require it.
+
+   - lid_file:
+
+   This determines which ACPI button file we look in to determine whether the
+   lid is open or closed after resuming from suspend to disk or power off.
+   If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state
+   and check its contents at the appropriate moment. See post_wake_state below
+   for more details on how this entry is used.
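+
+   For example, matching the path described above:
+
+   echo lid/LID > /sys/power/tuxonice/lid_file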
+
+   - log_everything (CONFIG_PM_DEBUG):
+
+   Setting this option results in all messages printed being logged. Normally,
+   only a subset are logged, so as to not slow the process and not clutter the
+   logs. Useful for debugging. It can be toggled during a cycle by pressing
+   'L'.
+
+   - no_load_direct:
+
+   This is a debugging option. If, when loading the atomically copied pages of
+   an image, TuxOnIce finds that the destination address for a page is free,
+   it will normally allocate that page, load the data directly into that
+   address and skip it in the atomic restore. If this option is disabled, the
+   page will be loaded somewhere else and atomically restored like other pages.
+
+   - no_flusher_thread:
+
+   When doing multithreaded I/O (see below), the first online CPU can be used
+   to _just_ submit compressed pages when writing the image, rather than
+   compressing and submitting data. This option is normally disabled, but has
+   been included because Nigel would like to see whether it will be more useful
+   as the number of cores/cpus in computers increases.
+
+   - no_multithreaded_io:
+
+   TuxOnIce will normally create one thread per cpu/core on your computer,
+   each of which will then perform I/O. This will generally result in
+   throughput that's the maximum the storage medium can handle. There
+   shouldn't be any reason to disable multithreaded I/O now, but this option
+   has been retained for debugging purposes.
+
+   - no_pageset2
+
+   See the entry for full_pageset2 above for an explanation of pagesets.
+   Enabling this option causes TuxOnIce to do an atomic copy of all pages,
+   thereby limiting the maximum image size to 1/2 of memory, as swsusp does.
+
+   - no_pageset2_if_unneeded
+
+   See the entry for full_pageset2 above for an explanation of pagesets.
+   Enabling this option causes TuxOnIce to act like no_pageset2 was enabled
+   if and only if it isn't needed anyway. This option may still make TuxOnIce
+   less reliable because pageset2 pages are normally used to store the
+   atomic copy - drivers that want to do allocations of larger amounts of
+   memory in one shot will be more likely to find that those amounts aren't
+   available if this option is enabled.
+
+   - pause_between_steps (CONFIG_PM_DEBUG):
+
+   This option is used during debugging, to make TuxOnIce pause between
+   each step of the process. It is ignored when the nice display is on.
+
+   - post_wake_state:
+
+   TuxOnIce provides support for automatically waking after a user-selected
+   delay, and using a different powerdown method if the lid is still closed.
+   (Yes, we're assuming a laptop).  This entry lets you choose what state
+   should be entered next. The values are those described under
+   powerdown_method, below. It can be used to suspend to RAM after hibernating,
+   then power down properly after (say) 20 minutes. It can also be used to
+   power down properly, then wake at (say) 6.30am and suspend to RAM until
+   you're ready to use the machine.
+
+   - powerdown_method:
+
+   Used to select a method by which TuxOnIce should powerdown after writing the
+   image. Currently:
+
+   0: Don't use ACPI to power off.
+   3: Attempt to enter Suspend-to-ram.
+   4: Attempt to enter ACPI S4 mode.
+   5: Attempt to power down via ACPI S5 mode.
+
+   Note that these options are highly dependent upon your hardware & software:
+
+   3: When successful, your machine suspends to ram instead of powering off.
+      The advantage of using this mode is that it doesn't matter whether your
+      battery has enough charge to make it through to your next resume. If it
+      lasts, you will simply resume from suspend to ram (and the image on disk
+      will be discarded). If the battery runs out, you will resume from disk
+      instead. The disadvantage is that it takes longer than a normal
+      suspend-to-ram to enter the state, since the suspend-to-disk image needs
+      to be written first.
+   4/5: When successful, your machine will be off and consume (almost) no power.
+      But it might still react to some external events like opening the lid or
+      traffic on a network or usb device. For the bios, resume is then the same
+      as warm boot, similar to a situation where you used the command `reboot'
+      to reboot your machine. If your machine has problems on warm boot or if
+      you want to protect your machine with the bios password, this is probably
+      not the right choice. Mode 4 may be necessary on some machines where ACPI
+      wake up methods need to be run to properly reinitialise hardware after a
+      hibernation cycle.
+   0: Switch the machine completely off. The only possible wakeup is the power
+      button. For the bios, resume is then the same as a cold boot, in
+      particular you would have to provide your bios boot password if your
+      machine uses that feature for booting.
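+
+   For example, to request suspend-to-ram after writing the image
+   (suspend-to-both):
+
+   echo 3 > /sys/power/tuxonice/powerdown_method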
+
+   - progressbar_granularity_limit:
+
+   This option can be used to limit the granularity of the progress bar
+   displayed with a bootsplash screen. The value is the maximum number of
+   steps. That is, 10 will make the progress bar jump in 10% increments.
+
+   - reboot:
+
+   This option causes TuxOnIce to reboot rather than powering down
+   at the end of saving an image. It can be toggled during a cycle by pressing
+   'R'.
+
+   - resume:
+
+   This sysfs entry can be used to read and set the location in which TuxOnIce
+   will look for the signature of an image - the value set using resume= at
+   boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By
+   writing to this file as well as modifying your bootloader's configuration
+   file (eg menu.lst), you can set or reset the location of your image or the
+   method of storing the image without rebooting.
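+
+   For example, reusing the swap example from earlier in this document:
+
+   echo swap:/dev/hda2:0x242d > /sys/power/tuxonice/resume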
+
+   - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP):
+
+   This option makes
+
+     echo disk > /sys/power/state
+
+   activate TuxOnIce instead of swsusp. Regardless of whether this option is
+   enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce
+   to check for an image too. This is due to the fact that at resume time, we
+   can't know whether this option was enabled until we see if an image is there
+   for us to resume from. (And when an image exists, we don't care whether we
+   did replace swsusp anyway - we just want to resume).
+
+   - resume_commandline:
+
+   This entry can be read after resuming to see the commandline that was used
+   when resuming began. You might use this to set up two bootloader entries
+   that are the same apart from the fact that one includes an extra append=
+   argument "at_work=1". You could then grep resume_commandline in your
+   post-resume scripts and configure networking (for example) differently
+   depending upon whether you're at home or work. resume_commandline can be
+   set to arbitrary text if you wish to remove sensitive contents.
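+
+   For example, a post-resume script might do something like:
+
+   if grep -q "at_work=1" /sys/power/tuxonice/resume_commandline; then
+       # apply the 'at work' network configuration here
+       :
+   fi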
+
+   - swap/swapfilename:
+
+   This entry is used to specify the swapfile or partition that
+   TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
+   I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
+   for my hibernation image, I would
+
+   echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile
+
+   /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
+   swapon and swapoff occur while other processes are frozen (including kswapd)
+   so this swap file will not be used up when attempting to free memory. The
+   partition/file is also given the highest priority, so other swapfiles/partitions
+   will only be used to save the image when this one is filled.
+
+   The value of this file is used by headerlocations along with any currently
+   activated swapfiles/partitions.
+
+   - swap/headerlocations:
+
+   This option tells you the resume= options to use for swap devices you
+   currently have activated. It is particularly useful when you only want to
+   use a swap file to store your image. See above for further details.
+
+   - test_bio
+
+   This is a debugging option. When enabled, TuxOnIce will not hibernate.
+   Instead, when asked to write an image, it will skip the atomic copy,
+   just doing the writing of the image and then returning control to the
+   user at the point where it would have powered off. This is useful for
+   testing throughput in different configurations.
+
+   - test_filter_speed
+
+   This is a debugging option. When enabled, TuxOnIce will not hibernate.
+   Instead, when asked to write an image, it will not write anything or do
+   an atomic copy, but will only run any enabled compression algorithm on the
+   data that would have been written (the source pages of the atomic copy in
+   the case of pageset 1). This is useful for comparing the performance of
+   compression algorithms and for determining the extent to which an upgrade
+   to your storage method would improve hibernation speed.
+
+   - user_interface/debug_sections (CONFIG_PM_DEBUG):
+
+   This value, together with the console log level, controls what debugging
+   information is displayed. The console log level determines the level of
+   detail, and this value determines what detail is displayed. This value is
+   a bit vector, and the meaning of the bits can be found in the kernel tree
+   in include/linux/tuxonice.h. It can be overridden using the kernel's
+   command line option suspend_dbg.
+
+   - user_interface/default_console_level (CONFIG_PM_DEBUG):
+
+   This determines the value of the console log level at the start of a
+   hibernation cycle. If debugging is compiled in, the console log level can be
+   changed during a cycle by pressing the digit keys. Meanings are:
+
+   0: Nice display.
+   1: Nice display plus numerical progress.
+   2: Errors only.
+   3: Low level debugging info.
+   4: Medium level debugging info.
+   5: High level debugging info.
+   6: Verbose debugging info.
+
+   - user_interface/enable_escape:
+
+   Setting this to "1" will enable you to abort a hibernation cycle or resuming
+   by pressing escape; "0" (default) disables this feature. Note that enabling
+   this option means that you cannot initiate a hibernation cycle and then walk
+   away from your computer, expecting it to be secure. With this feature
+   disabled, you can validly have this expectation once TuxOnIce begins to
+   write the image to disk. (Prior to this point, it is possible that TuxOnIce
+   might abort because of failure to freeze all processes or because
+   constraints on its ability to save the image are not met).
+
+   - user_interface/program
+
+   This entry is used to tell TuxOnIce what userspace program to use for
+   providing a user interface while hibernating. The program uses a netlink
+   socket to pass messages back and forward to the kernel, allowing it to
+   provide all of the functions formerly implemented in the kernel user
+   interface components.
+
+   - version:
+
+   The version of TuxOnIce you have compiled into the currently running kernel.
+
+   - wake_alarm_dir:
+
+   As mentioned above (post_wake_state), TuxOnIce supports automatically waking
+   after some delay. This entry allows you to select which wake alarm to use.
+   It should contain the value "rtc0" if you want to use
+   /sys/class/rtc/rtc0.
+
+   - wake_delay:
+
+   This value determines the delay from the end of writing the image until the
+   wake alarm is triggered. You can set an absolute time by writing the desired
+   time into /sys/class/rtc/<wake_alarm_dir>/wakealarm and leaving these values
+   empty.
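+
+   For example, to wake twenty minutes after the image is written (this
+   assumes wake_delay takes a value in seconds, which you should verify):
+
+   echo rtc0 > /sys/power/tuxonice/wake_alarm_dir
+   echo 1200 > /sys/power/tuxonice/wake_delay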
+
+   Note that for the wakeup to actually occur, you may need to modify entries
+   in /proc/acpi/wakeup. This is done by echoing the name of the button in the
+   first column (eg PBTN) into the file.
+
+7. How do you get support?
+
+   Glad you asked. TuxOnIce is being actively maintained and supported
+   by Nigel (the guy doing most of the kernel coding at the moment), Bernard
+   (who maintains the hibernate script and userspace user interface components)
+   and its users.
+
+   Resources available include HowTos, FAQs and a Wiki, all accessible via
+   tuxonice.net.  You can find the mailing lists there.
+
+8. I think I've found a bug. What should I do?
+
+   By far and away, the most common problems people have with TuxOnIce
+   relate to drivers not having adequate power management support. In this
+   case, it is not a bug with TuxOnIce, but we can still help you. As we
+   mentioned above, such issues can usually be worked around by building the
+   functionality as modules and unloading them while hibernating. Please visit
+   the Wiki for up-to-date lists of known issues and workarounds.
+
+   If this information doesn't help, try running:
+
+   hibernate --bug-report
+
+   ...and sending the output to the users mailing list.
+
+   Good information on how to provide us with useful information from an
+   oops is found in the file REPORTING-BUGS, in the top level directory
+   of the kernel tree. If you get an oops, please especially note the
+   information about running what is printed on the screen through ksymoops.
+   The raw information is useless.
+
+9. When will XXX be supported?
+
+   If there's a feature missing from TuxOnIce that you'd like, feel free to
+   ask. We try to be obliging, within reason.
+
+   Patches are welcome. Please send to the list.
+
+10. How does it work?
+
+   TuxOnIce does its work in a number of steps.
+
+   a. Freezing system activity.
+
+   The first main stage in hibernating is to stop all other activity. This is
+   achieved in stages. Processes are considered in four groups, which we will
+   describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
+   flag, kernel threads without this flag, userspace processes with the
+   PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
+   untouched by the refrigerator code. They are allowed to run during hibernating
+   and resuming, and are used to support user interaction, storage access or the
+   like. Other kernel threads (those unneeded while hibernating) are frozen last.
+   This leaves us with userspace processes that need to be frozen. When a
+   process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
+   that process for the duration of that call. Processes that have this flag are
+   frozen after processes without it, so that we can seek to ensure that dirty
+   data is synced to disk as quickly as possible in a situation where other
+   processes may be submitting writes at the same time. Freezing the processes
+   that are submitting data stops new I/O from being submitted. Syncthreads can
+   then cleanly finish their work. So the order is:
+
+   - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
+   - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
+   - Kernel processes without PF_NOFREEZE.
+
+   b. Eating memory.
+
+   For a successful hibernation cycle, you need to have enough disk space to store the
+   image and enough memory for the various limitations of TuxOnIce's
+   algorithm. You can also specify a maximum image size. In order to meet
+   those constraints, TuxOnIce may 'eat' memory. If, after freezing
+   processes, the constraints aren't met, TuxOnIce will thaw all the
+   other processes and begin to eat memory until its calculations indicate
+   the constraints are met. It will then freeze processes again and recheck
+   its calculations.
+
+   c. Allocation of storage.
+
+   Next, TuxOnIce allocates the storage that will be used to save
+   the image.
+
+   The core of TuxOnIce knows nothing about how or where pages are stored. We
+   therefore request the active allocator (remember you might have compiled in
+   more than one!) to allocate enough storage for our expected image size. If
+   this request cannot be fulfilled, we eat more memory and try again. If it
+   is fulfilled, we seek to allocate additional storage, just in case our
+   expected compression ratio (if any) isn't achieved. This time, however, we
+   just continue if we can't allocate enough storage.
+
+   If these calls to our allocator change the characteristics of the image
+   such that we haven't allocated enough memory, we also loop. (The allocator
+   may well need to allocate space for its storage information).
+
+   d. Write the first part of the image.
+
+   TuxOnIce stores the image in two sets of pages called 'pagesets'.
+   Pageset 2 contains pages on the active and inactive lists; essentially
+   the page cache. Pageset 1 contains all other pages, including the kernel.
+   We use two pagesets for one important reason: We need to make an atomic copy
+   of the kernel to ensure consistency of the image. Without a second pageset,
+   that would limit us to an image that was at most half the amount of memory
+   available. Using two pagesets allows us to store a full image. Since pageset
+   2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
+   We can then make our atomic copy of the remaining pages using both pageset 2
+   pages and any other pages that are free. While saving both pagesets, we are
+   careful not to corrupt the image. Among other things, we use lowlevel block
+   I/O routines that don't change the pagecache contents.
+
+   The next step, then, is writing pageset 2.
+
+   e. Suspending drivers and storing processor context.
+
+   Having written pageset2, TuxOnIce calls the power management functions to
+   notify drivers of the hibernation, and saves the processor state in preparation
+   for the atomic copy of memory we are about to make.
+
+   f. Atomic copy.
+
+   At this stage, everything else but the TuxOnIce code is halted. Processes
+   are frozen or idling, drivers are quiesced and have stored (ideally and where
+   necessary) their configuration in memory we are about to atomically copy.
+   In our lowlevel architecture specific code, we have saved the CPU state.
+   We can therefore now do our atomic copy before resuming drivers etc.
+
+   g. Save the atomic copy (pageset 1).
+
+   TuxOnIce can then write the atomic copy of the remaining pages. Since we
+   have copied the pages into other locations, we can continue to use the
+   normal block I/O routines without fear of corrupting our image.
+
+   h. Save the image header.
+
+   Nearly there! We save our settings and other parameters needed for
+   reloading pageset 1 in an 'image header'. We also tell our allocator to
+   serialise its data at this stage, so that it can reread the image at resume
+   time.
+
+   i. Set the image header.
+
+   Finally, we edit the header at our resume= location. The signature is
+   changed by the allocator to reflect the fact that an image exists, and to
+   point to the start of that data if necessary (swap allocator).
+
+   j. Power down.
+
+   Or reboot if we're debugging and the appropriate option is selected.
+
+   Whew!
+
+   Reloading the image.
+   --------------------
+
+   Reloading the image is essentially the reverse of all the above. We load
+   our copy of pageset 1, being careful to choose locations that aren't going
+   to be overwritten as we copy it back (We start very early in the boot
+   process, so there are no other processes to quiesce here). We then copy
+   pageset 1 back to its original location in memory and restore the process
+   context. We are now running with the original kernel. Next, we reload the
+   pageset 2 pages, free the memory and swap used by TuxOnIce, restore
+   the pageset header and restart processes. Sounds easy in comparison to
+   hibernating, doesn't it!
+
+   There is of course more to TuxOnIce than this, but this explanation
+   should be a good start. If there's interest, I'll write further
+   documentation on range pages and the low level I/O.
+
+11. Who wrote TuxOnIce?
+
+   (Answer based on the writings of Florent Chabaud, credits in files and
+   Nigel's limited knowledge; apologies to anyone missed out!)
+
+   The main developers of TuxOnIce have been...
+
+   Gabor Kuti
+   Pavel Machek
+   Florent Chabaud
+   Bernard Blackham
+   Nigel Cunningham
+
+   Significant portions of swsusp, the code in the vanilla kernel which
+   TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
+   also be expressed to him.
+
+   The above mentioned developers have been aided in their efforts by a host
+   of hundreds, if not thousands of testers and people who have submitted bug
+   fixes & suggestions. Of special note are the efforts of Michael Frank, who
+   had his computers repetitively hibernate and resume for literally tens of
+   thousands of cycles and developed scripts to stress the system and test
+   TuxOnIce far beyond the point most of us (Nigel included!) would consider
+   testing. His efforts have contributed as much to TuxOnIce as any of the
+   names above.
diff --git b/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
new file mode 100644
index 0000000..77a1e37
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -0,0 +1,357 @@
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and the
+reasoning behind, it. The codebase when the first stable version was released
+was approximately
+9000 lines less code than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine that when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval is guaranteed to have a bounded upper
+limit on how long a task will wait for, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a doubly
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+	(number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. This value is relevant for the time taken to look up
+tasks during scheduling. This will increase if many tasks have their CPU
+affinity set to limit which CPUs they're allowed to run on and those tasks
+outnumber the number of CPUs. The +1 is because when rescheduling a task,
+the CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The related
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+	jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
+The CPU proportion of different nice tasks works out to be approximately the
+
+	(prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
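+
+As a rough worked illustration (assuming the 10% increase per nice level
+compounds, and taking rr_interval at its default of 6ms): a nice -20 task gets
+a deadline offset of roughly 1 * 6ms = 6ms, while a nice 0 task gets roughly
+1.1^20 * 6ms ~= 40ms, so a waking nice -20 task will almost always have the
+earlier deadline and be chosen first.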
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
+
+
+Scalability.
+
+The major limitations of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of "sticky" tasks that are flagged when they are involuntarily
+descheduled, meaning they still want further CPU time. This sticky flag is
+used to bias heavily against those tasks being scheduled on a different CPU
+unless that CPU would be otherwise idle. When a cpu frequency governor is used
+that scales with CPU load, such as ondemand, sticky tasks are not scheduled
+on a different CPU at all, preferring instead to go idle. This means the CPU
+they were bound to is more likely to increase its speed while the other CPU
+will go idle, thus speeding up total task execution time and likely decreasing
+power usage. This is the only scenario where BFS will allow a CPU to go idle
+in preference to scheduling a task on the earliest available spare CPU.
+
+The real cost of migrating a task from one CPU to another is entirely dependent
+on the cache footprint of the task, how cache intensive the task is, how long
+it's been running on that CPU to take up the bulk of its cache, how big the CPU
+cache is, how fast and how layered the CPU cache is, how fast a context switch
+is... and so on. In other words, it's close to random in the real world where we
+do more than just one sole workload. The only thing we can be sure of is that
+it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
+utilising idle CPUs is more important than cache locality, and cache locality
+only plays a part after that.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling busyness. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+There are two "scheduler" tunables, the round robin interval and the
+interactive flag. These can be accessed in
+
+	/proc/sys/kernel/rr_interval
+	/proc/sys/kernel/interactive
+
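+For example, to inspect or change them at runtime (root required):
+
+	cat /proc/sys/kernel/rr_interval
+	echo 6 > /proc/sys/kernel/rr_interval	# back to the documented 6ms default
+	echo 1 > /proc/sys/kernel/interactive	# the default setting
+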
+rr_interval value
+
+The value is in milliseconds, and the default value is set to 6ms. Valid values
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. The accuracy of the rr interval is limited by HZ
+resolution of the kernel configuration. Thus, the worst case latencies are
+usually slightly higher than this actual value. BFS uses "dithering" to try and
+minimise the effect the Hz limitation has. The default value of 6 is not an
+arbitrary one. It is based on the fact that humans can detect jitter at
+approximately 7ms, so aiming for much lower latencies is pointless under most
+circumstances. It is worth noting this fact when comparing the latency
+performance of BFS to other schedulers. Worst case latencies being higher than
+7ms are far worse than average latencies not being in the microsecond range.
+Experimentation has shown that increasing the rr interval up to 300 can
+improve throughput, but beyond that, scheduling noise from elsewhere prevents
+further demonstrable throughput gains.
+
+interactive flag
+
+This is a simple boolean that can be set to 1 or 0, set to 1 by default.
+Disabling it sacrifices some of the interactive performance by giving tasks a
+degree of soft affinity for logical CPUs when this will lead to improved
+throughput; it also sacrifices the completely deterministic nature with respect
+to latency that BFS otherwise normally provides, and subsequently leads to
+slightly higher latencies and a noticeably less interactive system.
+
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+	schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+	/proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average.
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
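+
+For example, to lower the limit to half of the total CPU across the machine:
+
+	echo 50 > /proc/sys/kernel/iso_cpu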
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo the SCHED_ISO policy override that
+the user specified on the command line. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
+
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode or so on without any slowdown of other tasks. To prevent this
+policy from grabbing shared resources and holding them indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram, and so on, it will transiently schedule the task as
+SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+	schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
+
+Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a93b414..f77a251 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
 - hung_task_timeout_secs
 - hung_task_warnings
 - kexec_load_disabled
+- iso_cpu
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
@@ -67,6 +68,7 @@ show up in /proc/sys/kernel:
 - randomize_va_space
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -396,6 +398,16 @@ kernel stack.
 
 ==============================================================
 
+iso_cpu: (BFS CPU scheduler only).
+
+This sets the percentage of cpu that unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling five
+seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
@@ -750,6 +762,20 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval: (BFS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
@@ -760,6 +786,14 @@ rtsig-nr shows the number of RT signals currently queued.
 
 ==============================================================
 
+sched_schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
 sg-big-buff:
 
 This file shows the size of the generic SCSI (sg) buffer.
diff --git b/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt
new file mode 100644
index 0000000..d037301
--- /dev/null
+++ b/Documentation/tp_smapi.txt
@@ -0,0 +1,267 @@
+tp_smapi version 0.40
+IBM ThinkPad hardware functions driver
+
+Author:  Shem Multinymous <multinymous@gmail.com>
+Project: http://sourceforge.net/projects/tpctl
+Wiki:    http://thinkwiki.org/wiki/tp_smapi
+List:    linux-thinkpad@linux-thinkpad.org
+         (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad)
+
+Description
+-----------
+
+ThinkPad laptops include a proprietary interface called SMAPI BIOS
+(System Management Application Program Interface) which provides some
+hardware control functionality that is not accessible by other means.
+
+This driver exposes some features of the SMAPI BIOS through a sysfs
+interface. It is suitable for newer models, on which SMAPI is invoked
+through IO port writes. Older models use a different SMAPI interface;
+for those, try the "thinkpad" module from the "tpctl" package.
+
+WARNING:
+This driver uses undocumented features and direct hardware access.
+It thus cannot be guaranteed to work, and may cause arbitrary damage
+(especially on models it wasn't tested on).
+
+
+Module parameters
+-----------------
+
+thinkpad_ec module:
+  force_io=1 lets thinkpad_ec load on some recent ThinkPad models
+  (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need.
+tp_smapi module:
+  debug=1    enables verbose dmesg output.
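+
+For example, a typical invocation passing these parameters at load time
+(loading by hand like this is mainly for testing; most users would set the
+options in a modprobe.d configuration file instead):
+
+# modprobe thinkpad_ec force_io=1
+# modprobe tp_smapi debug=1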
+
+
+Usage
+-----
+
+Control of battery charging thresholds (in percent of the current full charge
+capacity):
+
+# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh
+# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh
+# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh
+
+    (This is useful since Li-Ion batteries wear out much faster at very
+     high or low charge levels. The driver will also keep the thresholds
+     across suspend-to-disk with AC disconnected; this isn't done
+     automatically by the hardware.)
+
+Inhibiting battery charging for 17 minutes (overrides thresholds):
+
+# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+# echo 0  > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes  # stop
+# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+
+    (This can be used to control which battery is charged when using an
+     Ultrabay battery.)
+
+Forcing battery discharging even if AC power is available:
+
+# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge  # start discharge
+# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge  # stop discharge
+# cat /sys/devices/platform/smapi/BAT0/force_discharge
+
+    (When AC is connected, forced discharging will automatically stop
+     when battery is fully depleted -- this is useful for calibration.
+     Also, this attribute can be used to control which battery is discharged
+     when both a system battery and an Ultrabay battery are connected.)
+
+Misc read-only battery status attributes (see note about HDAPS below):
+
+/sys/devices/platform/smapi/BAT0/installed   # 0 or 1
+/sys/devices/platform/smapi/BAT0/state       # idle/charging/discharging
+/sys/devices/platform/smapi/BAT0/cycle_count # integer counter
+/sys/devices/platform/smapi/BAT0/current_now # instantaneous current
+/sys/devices/platform/smapi/BAT0/current_avg # last minute average
+/sys/devices/platform/smapi/BAT0/power_now   # instantaneous power
+/sys/devices/platform/smapi/BAT0/power_avg   # last minute average
+/sys/devices/platform/smapi/BAT0/last_full_capacity         # in mWh
+/sys/devices/platform/smapi/BAT0/remaining_percent          # remaining percent of energy (set by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_percent_error    # error range of remaining_percent (not reset by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_running_time     # in minutes, by last minute average power
+/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantaneous power
+/sys/devices/platform/smapi/BAT0/remaining_charging_time    # in minutes
+/sys/devices/platform/smapi/BAT0/remaining_capacity         # in mWh
+/sys/devices/platform/smapi/BAT0/design_capacity            # in mWh
+/sys/devices/platform/smapi/BAT0/voltage           # in mV
+/sys/devices/platform/smapi/BAT0/design_voltage    # in mV
+/sys/devices/platform/smapi/BAT0/charging_max_current  # max charging current
+/sys/devices/platform/smapi/BAT0/charging_max_voltage  # max charging voltage
+/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below
+/sys/devices/platform/smapi/BAT0/manufacturer      # string
+/sys/devices/platform/smapi/BAT0/model             # string
+/sys/devices/platform/smapi/BAT0/barcoding         # string
+/sys/devices/platform/smapi/BAT0/chemistry         # string
+/sys/devices/platform/smapi/BAT0/serial            # integer
+/sys/devices/platform/smapi/BAT0/manufacture_date  # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/first_use_date    # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/temperature  # in milli-Celsius
+/sys/devices/platform/smapi/BAT0/dump         # see below
+/sys/devices/platform/smapi/ac_connected      # 0 or 1
+
+The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups
+in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models,
+the battery contains 3 cell groups in series, where each group consists of 2
+or 3 cells connected in parallel. The voltage of each group is given by these
+attributes, and their sum (roughly) equals the "voltage" attribute.
+(The effective performance of the battery is determined by the weakest group,
+i.e., the one whose voltage changes most rapidly during dis/charging.)
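+
+For example, to read all four group voltages of the main battery (the brace
+expansion assumes a bash-like shell):
+
+# cat /sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage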
+
+The "BAT0/dump" attribute gives a a hex dump of the raw status data, which
+contains additional data now in the above (if you can figure it out). Some
+unused values are autodetected and replaced by "--":
+
+In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g.
+in the UltraBay).
+
+
+Raw SMAPI calls:
+
+/sys/devices/platform/smapi/smapi_request
+This performs raw SMAPI calls. It uses a bad interface that cannot handle
+multiple simultaneous accesses. Don't touch it; it's for development only.
+If you did touch it, you would do something like
+# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request
+# cat /sys/devices/platform/smapi/smapi_request
+and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd
+value, converted to decimal, is 75: the current charge stop threshold.
+
+
+Model-specific status
+---------------------
+
+Works (at least partially) on the following ThinkPad models:
+* A30
+* G41
+* R40, R50p, R51, R52
+* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60
+* X24, X31, X32, X40, X41, X60
+* Z60t, Z61m
+
+Not all functions are available on all models; for detailed status, see:
+  http://thinkwiki.org/wiki/tp_smapi
+
+Please report success/failure by e-mail or on the Wiki.
+If you get a "not implemented" or "not supported" message, your laptop
+probably just can't do that (at least not via the SMAPI BIOS).
+For negative reports, follow the bug reporting guidelines below.
+If you send me the necessary technical data (i.e., SMAPI function
+interfaces), I will support additional models.
+
+
+Additional HDAPS features
+-------------------------
+
+The modified hdaps driver has several improvements over the one in mainline
+(beyond resolving the conflict with thinkpad_ec and tp_smapi):
+
+- Fixes reliability and improves support for recent ThinkPad models
+  (especially *60 and newer). Unlike the mainline driver, the modified hdaps
+  correctly follows the Embedded Controller communication protocol.
+
+- Extends the "invert" parameter to cover all possible axis orientations.
+  The possible values are as follows.
+  Let X,Y denote the hardware readouts.
+  Let R denote the laptop's roll (tilt left/right).
+  Let P denote the laptop's pitch (tilt forward/backward).
+    invert=0:   R= X  P= Y   (same as mainline)
+    invert=1:   R=-X  P=-Y   (same as mainline)
+    invert=2:   R=-X  P= Y   (new)
+    invert=3:   R= X  P=-Y   (new)
+    invert=4:   R= Y  P= X   (new)
+    invert=5:   R=-Y  P=-X   (new)
+    invert=6:   R=-Y  P= X   (new)
+    invert=7:   R= Y  P=-X   (new)
+  It's probably easiest to just try all 8 possibilities and see which yields
+  correct results (e.g., in the hdaps-gl visualisation).
+
+- Adds a whitelist which automatically sets the correct axis orientation for
+  some models. If the value for your model is wrong or missing, you can override
+  it using the "invert" parameter. Please also update the tables at
+  http://www.thinkwiki.org/wiki/tp_smapi and
+  http://www.thinkwiki.org/wiki/List_of_DMI_IDs
+  and submit a patch for the whitelist in hdaps.c.
+
+- Provides new attributes:
+  /sys/devices/platform/hdaps/sampling_rate:
+    This determines the frequency at which the host queries the embedded
+    controller for accelerometer data (and informs the hdaps input devices).
+    Default=50.
+  /sys/devices/platform/hdaps/oversampling_ratio:
+    When set to X, the embedded controller is told to do physical accelerometer
+    measurements at a rate that is X times higher than the rate at which
+    the driver reads those measurements (i.e., X*sampling_rate). This
+    makes the readouts from the embedded controller fresher, and is also
+    useful for the running average filter (see next). Default=5.
+  /sys/devices/platform/hdaps/running_avg_filter_order:
+    When set to X, reported readouts will be the average of the last X physical
+    accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a
+    high value decreases readout fluctuations. The averaging is handled by the
+    embedded controller, so no CPU resources are used. Higher values make the
+    readouts smoother, since it averages out both sensor noise (good) and abrupt
+    changes (bad). Default=2.
+
+- Provides a second input device, which publishes the raw accelerometer
+  measurements (without the fuzzing needed for joystick emulation). This input
+  device can be matched by a udev rule such as the following (all on one line):
+    KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1",
+    ATTRS{modalias}=="input:b0019v1014p5054e4801-*",
+    SYMLINK+="input/hdaps/accelerometer-event
+
+A new version of the hdapsd userspace daemon, which uses the input device
+interface instead of polling sysfs, is available separately. Using this reduces
+the total interrupts per second generated by hdaps+hdapsd (on tickless kernels)
+to 50, down from a value that fluctuates between 50 and 100. Set the
+sampling_rate sysfs attribute to a lower value to further reduce interrupts,
+at the expense of response latency.
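+
+For example (25 is only an illustrative value), to lower the polling rate
+from the default and check the result:
+
+# echo 25 > /sys/devices/platform/hdaps/sampling_rate
+# cat /sys/devices/platform/hdaps/sampling_rate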
+
+Licensing note: all my changes to the HDAPS driver are licensed under the
+GPL version 2 or, at your option and to the extent allowed by derivation from
+prior works, any later version. My version of hdaps is derived work from the
+mainline version, which at the time of writing is available only under
+GPL version 2.
+
+Bug reporting
+-------------
+
+Mail <multinymous@gmail.com>. Please include:
+* Details about your model,
+* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with
+  the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1").
+* Output of "dmidecode | grep -C5 Product"
+* Does the failed functionality work under Windows?
+
+
+More about SMAPI
+----------------
+
+For hints about what may be possible via the SMAPI BIOS and how, see:
+
+* IBM Technical Reference Manual for the ThinkPad 770
+  (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD)
+* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x").
+* drivers/char/mwave/smapi.c in the Linux kernel tree.
+* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net).
+* The SMAPI_* constants in tp_smapi.c.
+
+Note that in the above Technical Reference and in the "thinkpad" module,
+SMAPI is invoked through a function call to some physical address. However,
+the interface used by tp_smapi and the above mwave driver, and apparently
+required by newer ThinkPads, is different: you set the parameters up in the
+CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this
+triggers an SMI (System Management Interrupt), causing the CPU to enter
+SMM (System Management Mode) and run the BIOS firmware; the results are
+returned in the CPU's registers. It is not clear how the two variants of
+SMAPI are related, though the assignment of error codes seems to be
+similar.
+
+In addition, the embedded controller on ThinkPad laptops has a non-standard
+interface at IO ports 0x1600-0x161F (mapped to LPC channel 3 of the H8S chip).
+The interface provides various system management services (currently known:
+battery information and accelerometer readouts). For more information see the
+thinkpad_ec module and the H8S hardware documentation:
+http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
diff --git a/MAINTAINERS b/MAINTAINERS
index 77e4c10..533e7e5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2082,6 +2082,19 @@ F:	include/linux/audit.h
 F:	include/uapi/linux/audit.h
 F:	kernel/audit*
 
+AUFS (advanced multi layered unification filesystem) FILESYSTEM
+M:	"J. R. Okajima" <hooanon05g@gmail.com>
+L:	linux-unionfs@vger.kernel.org
+L:	aufs-users@lists.sourceforge.net (members only)
+W:	http://aufs.sourceforge.net
+T:	git://github.com/sfjro/aufs4-linux.git
+S:	Supported
+F:	Documentation/filesystems/aufs/
+F:	Documentation/ABI/testing/debugfs-aufs
+F:	Documentation/ABI/testing/sysfs-aufs
+F:	fs/aufs/
+F:	include/uapi/linux/aufs_type.h
+
 AUXILIARY DISPLAY DRIVERS
 M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
 W:	http://miguelojeda.es/auxdisplay.htm
@@ -11181,6 +11194,13 @@ S:	Maintained
 F:	drivers/tc/
 F:	include/linux/tc.h
 
+TUXONICE (ENHANCED HIBERNATION)
+P:	Nigel Cunningham
+M:	nigel@nigelcunningham.com.au
+L:	tuxonice-devel@tuxonice.net
+W:	http://tuxonice.net
+S:	Maintained
+
 U14-34F SCSI DRIVER
 M:	Dario Ballabio <ballabio_dario@emc.com>
 L:	linux-scsi@vger.kernel.org
diff --git a/Makefile b/Makefile
index 00ce08d..8084251 100644
--- a/Makefile
+++ b/Makefile
@@ -615,6 +615,8 @@ KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS	+= -Os $(call cc-disable-warning,maybe-uninitialized,)
+else ifdef CONFIG_CC_OPTIMIZE_HARDER
+KBUILD_CFLAGS	+= -O3 $(call cc-disable-warning,maybe-uninitialized,)
 else
 KBUILD_CFLAGS	+= -O2
 endif
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dda1959..08e49c4 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+		struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
 		vcpu->arch.pause = false;
-		wake_up_interruptible(wq);
+		swake_up(wq);
 	}
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
 {
-	wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
-	wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+	swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
 				       (!vcpu->arch.pause)));
 }
 
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index a9b3b90..c2b1315 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
 	struct kvm *kvm = source_vcpu->kvm;
 	struct kvm_vcpu *vcpu = NULL;
-	wait_queue_head_t *wq;
+	struct swait_queue_head *wq;
 	unsigned long cpu_id;
 	unsigned long context_id;
 	phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	smp_mb();		/* Make sure the above is visible */
 
 	wq = kvm_arch_vcpu_wq(vcpu);
-	wake_up_interruptible(wq);
+	swake_up(wq);
 
 	return PSCI_RET_SUCCESS;
 }
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 3110447..70ef1a4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
 	dvcpu->arch.wait = 0;
 
-	if (waitqueue_active(&dvcpu->wq))
-		wake_up_interruptible(&dvcpu->wq);
+	if (swait_active(&dvcpu->wq))
+		swake_up(&dvcpu->wq);
 
 	return 0;
 }
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
 	kvm_mips_callbacks->queue_timer_int(vcpu);
 
 	vcpu->arch.wait = 0;
-	if (waitqueue_active(&vcpu->wq))
-		wake_up_interruptible(&vcpu->wq);
+	if (swait_active(&vcpu->wq))
+		swake_up(&vcpu->wq);
 }
 
 /* low level hrtimer wake routine */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9d08d8c..c98afa5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvmppc_vcore {
 	struct list_head runnable_threads;
 	struct list_head preempt_list;
 	spinlock_t lock;
-	wait_queue_head_t wq;
+	struct swait_queue_head wq;
 	spinlock_t stoltb_lock;	/* protects stolen_tb and preempt_tb */
 	u64 stolen_tb;
 	u64 preempt_tb;
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
 	u8 prodded;
 	u32 last_inst;
 
-	wait_queue_head_t *wqp;
+	struct swait_queue_head *wqp;
 	struct kvmppc_vcore *vcore;
 	int ret;
 	int trap;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index baeddb0..f1187bb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
 	int cpu;
-	wait_queue_head_t *wqp;
+	struct swait_queue_head *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (waitqueue_active(wqp)) {
-		wake_up_interruptible(wqp);
+	if (swait_active(wqp)) {
+		swake_up(wqp);
 		++vcpu->stat.halt_wakeup;
 	}
 
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		tvcpu->arch.prodded = 1;
 		smp_mb();
 		if (vcpu->arch.ceded) {
-			if (waitqueue_active(&vcpu->wq)) {
-				wake_up_interruptible(&vcpu->wq);
+			if (swait_active(&vcpu->wq)) {
+				swake_up(&vcpu->wq);
 				vcpu->stat.halt_wakeup++;
 			}
 		}
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	INIT_LIST_HEAD(&vcore->runnable_threads);
 	spin_lock_init(&vcore->lock);
 	spin_lock_init(&vcore->stoltb_lock);
-	init_waitqueue_head(&vcore->wq);
+	init_swait_queue_head(&vcore->wq);
 	vcore->preempt_tb = TB_NIL;
 	vcore->lpcr = kvm->arch.lpcr;
 	vcore->first_vcpuid = core * threads_per_subcore;
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
 	struct kvm_vcpu *vcpu;
 	int do_sleep = 1;
+	DECLARE_SWAITQUEUE(wait);
 
-	DEFINE_WAIT(wait);
-
-	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 
 	/*
 	 * Check one last time for pending exceptions and ceded state after
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	}
 
 	if (!do_sleep) {
-		finish_wait(&vc->wq, &wait);
+		finish_swait(&vc->wq, &wait);
 		return;
 	}
 
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	trace_kvmppc_vcore_blocked(vc, 0);
 	spin_unlock(&vc->lock);
 	schedule();
-	finish_wait(&vc->wq, &wait);
+	finish_swait(&vc->wq, &wait);
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
 	trace_kvmppc_vcore_blocked(vc, 1);
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			kvmppc_start_thread(vcpu, vc);
 			trace_kvm_guest_enter(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
-			wake_up(&vc->wq);
+			swake_up(&vc->wq);
 		}
 
 	}
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 998f632..5c80a0f 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO		120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8959ebb..b0c8ad0 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
 struct kvm_s390_local_interrupt {
 	spinlock_t lock;
 	struct kvm_s390_float_interrupt *float_int;
-	wait_queue_head_t *wq;
+	struct swait_queue_head *wq;
 	atomic_t *cpuflags;
 	DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
 	struct kvm_s390_irq_payload irq;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72..9ffc732 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -966,13 +966,13 @@ no_timer:
 
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 {
-	if (waitqueue_active(&vcpu->wq)) {
+	if (swait_active(&vcpu->wq)) {
 		/*
 		 * The vcpu gave up the cpu voluntarily, mark it as a good
 		 * yield-candidate.
 		 */
 		vcpu->preempted = true;
-		wake_up_interruptible(&vcpu->wq);
+		swake_up(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3bf45a0..b971ea4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -911,10 +911,26 @@ config SCHED_SMT
 	depends on SMP
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  when dealing with Intel P4/Core 2 chips with HyperThreading at a
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SMT_NICE
+	bool "SMT (Hyperthreading) aware nice priority and policy support"
+	depends on SCHED_BFS && SCHED_SMT
+	default y
+	---help---
+	  Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+	  of the use of 'nice' levels and different scheduling policies
+	  (e.g. realtime) due to sharing of CPU power between hyperthreads.
+	  SMT nice support makes each logical CPU aware of what is running on
+	  its hyperthread siblings, maintaining appropriate distribution of
+	  CPU according to nice levels and scheduling policies at the expense
+	  of slightly increased overhead.
+
+	  If unsure say Y here.
+
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
@@ -1996,7 +2012,7 @@ config HOTPLUG_CPU
 config BOOTPARAM_HOTPLUG_CPU0
 	bool "Set default setting of cpu0_hotpluggable"
 	default n
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
 	---help---
 	  Set whether default state of cpu0_hotpluggable is on or off.
 
@@ -2025,7 +2041,7 @@ config BOOTPARAM_HOTPLUG_CPU0
 config DEBUG_HOTPLUG_CPU0
 	def_bool n
 	prompt "Debug CPU0 hotplug"
-	depends on HOTPLUG_CPU
+	depends on HOTPLUG_CPU && !SCHED_BFS
 	---help---
 	  Enabling this option offlines CPU0 (if CPU0 can be offlined) as
 	  soon as possible and boots up userspace with CPU0 offlined. User
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 3ba5ff2..ee3e7d4 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -147,9 +147,8 @@ config MPENTIUM4
 		-Paxville
 		-Dempsey
 
-
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	---help---
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -157,7 +156,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	---help---
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -165,12 +164,69 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	---help---
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	---help---
+	  Select this for AMD Opteron or Athlon64 Hammer-family processors
+	  with SSE3 support.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	---help---
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+	  Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	---help---
+	  Select this for AMD Barcelona and newer processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	---help---
+	  Select this for AMD Bobcat processors.
+
+	  Enables -march=btver1
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	---help---
+	  Select this for AMD Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	---help---
+	  Select this for AMD Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	---help---
+	  Select this for AMD Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	---help---
+	  Select this for AMD Jaguar processors.
+
+	  Enables -march=btver2
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -261,8 +317,17 @@ config MPSC
 	  using the cpu family field
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
+config MATOM
+	bool "Intel Atom"
+	---help---
+
+	  Select this for the Intel Atom platform. Intel Atom CPUs have an
+	  in-order pipelining architecture and thus can benefit from
+	  accordingly optimized code. Use a recent GCC with specific Atom
+	  support in order to fully benefit from selecting this option.
+
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	---help---
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -270,14 +335,71 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
-config MATOM
-	bool "Intel Atom"
+	  Enables -march=core2
+
+config MNEHALEM
+	bool "Intel Nehalem"
 	---help---
 
-	  Select this for the Intel Atom platform. Intel Atom CPUs have an
-	  in-order pipelining architecture and thus can benefit from
-	  accordingly optimized code. Use a recent GCC with specific Atom
-	  support in order to fully benefit from selecting this option.
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	---help---
+
+	  Select this for the Intel Westmere (formerly Nehalem-C) family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	---help---
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	---help---
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	---help---
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	---help---
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	---help---
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	---help---
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
 
 config GENERIC_CPU
 	bool "Generic-x86-64"
@@ -286,6 +408,19 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config MNATIVE
+ bool "Native optimizations autodetected by GCC"
+ ---help---
+
+   GCC 4.2 and above support -march=native, which automatically detects
+   the optimum settings to use based on your processor. -march=native 
+   also detects and applies additional settings beyond -march specific
+   to your CPU, (eg. -msse4). Unless you have a specific reason not to
+   (e.g. distcc cross-compiling), you should probably be using
+   -march=native rather than anything listed below.
+
+   Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -300,6 +435,16 @@ config X86_GENERIC
 	  This is really intended for distributors who need more
 	  generic optimizations.
 
+config X86_MARCH_NATIVE
+	bool "Use -march=native cflag (EXPERIMENTAL)"
+	help
+	  Setting Y here will result in passing the -march=native and
+	  -mtune=native cflags to GCC while compiling the kernel, which
+	  makes GCC check the CPU capabilities and use the best cflags
+	  for your computer.
+
+	  Set Y here only if you use >=gcc-4.2.0.
+
 #
 # Define implied options from the CPU selection here
 config X86_INTERNODE_CACHE_SHIFT
@@ -310,7 +455,7 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
 	default "4" if MELAN || M486 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
@@ -341,11 +486,11 @@ config X86_ALIGNMENT_16
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE
 
 config X86_USE_3DNOW
 	def_bool y
@@ -369,17 +514,17 @@ config X86_P6_NOP
 
 config X86_TSC
 	def_bool y
-	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
+	depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+	depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4086abc..f728a7c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -104,13 +104,38 @@ else
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
 
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
+        cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+        cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8)
+        cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
+        cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
+        cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
+        cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
+        cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
+        cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3)
+        cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
 
         cflags-$(CONFIG_MCORE2) += \
-                $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
-	cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
-		$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+                $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
+        cflags-$(CONFIG_MNEHALEM) += \
+                $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem))
+        cflags-$(CONFIG_MWESTMERE) += \
+                $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere))
+        cflags-$(CONFIG_MSILVERMONT) += \
+                $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont))
+        cflags-$(CONFIG_MSANDYBRIDGE) += \
+                $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge))
+        cflags-$(CONFIG_MIVYBRIDGE) += \
+                $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge))
+        cflags-$(CONFIG_MHASWELL) += \
+                $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell))
+        cflags-$(CONFIG_MBROADWELL) += \
+                $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell))
+        cflags-$(CONFIG_MSKYLAKE) += \
+                $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake))
+        cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \
+                $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
         cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
         KBUILD_CFLAGS += $(cflags-y)
 
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 6647ed4..6b71290 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -10,6 +10,9 @@ tune		= $(call cc-option,-mcpu=$(1),$(2))
 endif
 
 align := $(cc-option-align)
+ifeq ($(CONFIG_X86_MARCH_NATIVE),y)
+cflags-y			+= -march=native
+else
 cflags-$(CONFIG_M486)		+= -march=i486
 cflags-$(CONFIG_M586)		+= -march=i586
 cflags-$(CONFIG_M586TSC)	+= -march=i586
@@ -23,7 +26,16 @@ cflags-$(CONFIG_MK6)		+= -march=k6
 # Please note, that patches that add -march=athlon-xp and friends are pointless.
 # They make zero difference whatsosever to performance at this time.
 cflags-$(CONFIG_MK7)		+= -march=athlon
+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
 cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,-march=athlon)
+cflags-$(CONFIG_MK8SSE3)		+= $(call cc-option,-march=k8-sse3,-march=athlon)
+cflags-$(CONFIG_MK10)	+= $(call cc-option,-march=amdfam10,-march=athlon)
+cflags-$(CONFIG_MBARCELONA)	+= $(call cc-option,-march=barcelona,-march=athlon)
+cflags-$(CONFIG_MBOBCAT)	+= $(call cc-option,-march=btver1,-march=athlon)
+cflags-$(CONFIG_MBULLDOZER)	+= $(call cc-option,-march=bdver1,-march=athlon)
+cflags-$(CONFIG_MPILEDRIVER)	+= $(call cc-option,-march=bdver2,-march=athlon)
+cflags-$(CONFIG_MSTEAMROLLER)	+= $(call cc-option,-march=bdver3,-march=athlon)
+cflags-$(CONFIG_MJAGUAR)	+= $(call cc-option,-march=btver2,-march=athlon)
 cflags-$(CONFIG_MCRUSOE)	+= -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MEFFICEON)	+= -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MWINCHIPC6)	+= $(call cc-option,-march=winchip-c6,-march=i586)
@@ -32,8 +44,16 @@ cflags-$(CONFIG_MCYRIXIII)	+= $(call cc-option,-march=c3,-march=i486) $(align)-f
 cflags-$(CONFIG_MVIAC3_2)	+= $(call cc-option,-march=c3-2,-march=i686)
 cflags-$(CONFIG_MVIAC7)		+= -march=i686
 cflags-$(CONFIG_MCORE2)		+= -march=i686 $(call tune,core2)
-cflags-$(CONFIG_MATOM)		+= $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
-	$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+cflags-$(CONFIG_MNEHALEM)	+= -march=i686 $(call tune,nehalem)
+cflags-$(CONFIG_MWESTMERE)	+= -march=i686 $(call tune,westmere)
+cflags-$(CONFIG_MSILVERMONT)	+= -march=i686 $(call tune,silvermont)
+cflags-$(CONFIG_MSANDYBRIDGE)	+= -march=i686 $(call tune,sandybridge)
+cflags-$(CONFIG_MIVYBRIDGE)	+= -march=i686 $(call tune,ivybridge)
+cflags-$(CONFIG_MHASWELL)	+= -march=i686 $(call tune,haswell)
+cflags-$(CONFIG_MBROADWELL)	+= -march=i686 $(call tune,broadwell)
+cflags-$(CONFIG_MSKYLAKE)	+= -march=i686 $(call tune,skylake)
+cflags-$(CONFIG_MATOM)		+= $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \
+	$(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
 
 # AMD Elan support
 cflags-$(CONFIG_MELAN)		+= -march=i486
@@ -41,6 +61,8 @@ cflags-$(CONFIG_MELAN)		+= -march=i486
 # Geode GX1 support
 cflags-$(CONFIG_MGEODEGX1)	+= -march=pentium-mmx
 cflags-$(CONFIG_MGEODE_LX)	+= $(call cc-option,-march=geode,-march=pentium-mmx)
+endif
+
 # add at the end to overwrite eventual tuning options from earlier
 # cpu entries
 cflags-$(CONFIG_X86_GENERIC) 	+= $(call tune,generic,$(call tune,i686))
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819..90f3c76 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -15,6 +15,24 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE
+#define MODULE_PROC_FAMILY "NATIVE "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -33,6 +51,22 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 21bf924..1fbf5af 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -27,7 +27,8 @@
 static int max_ypos = 25, max_xpos = 80;
 static int current_ypos = 25, current_xpos;
 
-static void early_vga_write(struct console *con, const char *str, unsigned n)
+static void early_vga_write(struct console *con, const char *str, unsigned n,
+                            unsigned int loglevel)
 {
 	char c;
 	int  i, k, j;
@@ -118,7 +119,8 @@ static int early_serial_putc(unsigned char ch)
 	return timeout ? 0 : -1;
 }
 
-static void early_serial_write(struct console *con, const char *s, unsigned n)
+static void early_serial_write(struct console *con, const char *s, unsigned n,
+                               unsigned int loglevel)
 {
 	while (*s && n-- > 0) {
 		if (*s == '\n')
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416..a457bce 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -173,6 +173,7 @@ void init_espfix_ap(int cpu)
 		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
 
 		pmd_p = (pmd_t *)page_address(page);
+		SetPageTOI_Untracked(virt_to_page(pmd_p));
 		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
 		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -185,6 +186,7 @@ void init_espfix_ap(int cpu)
 		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
 
 		pte_p = (pte_t *)page_address(page);
+		SetPageTOI_Untracked(virt_to_page(pte_p));
 		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
 		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -193,6 +195,7 @@ void init_espfix_ap(int cpu)
 
 	pte_p = pte_offset_kernel(&pmd, addr);
 	stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
+	SetPageTOI_Untracked(virt_to_page(stack_page));
 	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 589b319..435466f 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -28,8 +28,18 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 
 	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
 		return -EINVAL;
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+	if (turn_on) {
+		struct sched_param param = { .sched_priority = 0 };
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+		/* Start X as SCHED_ISO */
+		sched_setscheduler_nocheck(current, SCHED_ISO, &param);
+	}
+#else
 	if (turn_on && !capable(CAP_SYS_RAWIO))
 		return -EPERM;
+#endif
 
 	/*
 	 * If it's the first ioperm() call in this thread's lifetime, set the
@@ -108,8 +118,15 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
 		return -EINVAL;
 	/* Trying to gain more privileges? */
 	if (level > old) {
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+		struct sched_param param = { .sched_priority = 0 };
+#endif
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+		/* Start X as SCHED_ISO */
+		sched_setscheduler_nocheck(current, SCHED_ISO, &param);
+#endif
 	}
 	regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
 		(level << X86_EFLAGS_IOPL_BIT);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3d743da..4b20bb7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -12,6 +12,7 @@
 #include <linux/percpu.h>
 #include <linux/timex.h>
 #include <linux/static_key.h>
+#include <linux/mm.h>
 
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -195,6 +196,10 @@ static void cyc2ns_init(int cpu)
 
 	c2n->head = c2n->data;
 	c2n->tail = c2n->data;
+
+	/*
+	 * Don't let TuxOnIce make this data read-only: a secondary CPU will
+	 * cause a triple fault if it loads microcode, which then does a
+	 * printk, which may end up invoking cycles_2_ns().
+	 */
+	SetPageTOI_Untracked(virt_to_page(c2n));
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591fa..3a045f3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
-	wait_queue_head_t *q = &vcpu->wq;
+	struct swait_queue_head *q = &vcpu->wq;
 	struct kvm_timer *ktimer = &apic->lapic_timer;
 
 	if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 	atomic_inc(&apic->lapic_timer.pending);
 	kvm_set_pending_timer(vcpu);
 
-	if (waitqueue_active(q))
-		wake_up_interruptible(q);
+	if (swait_active(q))
+		swake_up(q);
 
 	if (apic_lvtt_tscdeadline(apic))
 		ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e830c71..8d2472b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/tuxonice.h>             /* incremental image support    */
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -662,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	unsigned long flags;
 	int sig;
 
+	if (toi_make_writable(init_mm.pgd, address))
+		return;
+
 	/* Are we prepared to handle this kernel fault? */
 	if (fixup_exception(regs)) {
 		/*
@@ -916,10 +921,101 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
+#ifdef CONFIG_TOI_INCREMENTAL
+/**
+ * toi_do_cbw - Do a copy-before-write before letting the faulting process continue
+ */
+static void toi_do_cbw(struct page *page)
+{
+    struct toi_cbw_state *state = this_cpu_ptr(&toi_cbw_states);
+
+    state->active = 1;
+    wmb();
+
+    if (state->enabled && state->next && PageTOI_CBW(page)) {
+        struct toi_cbw *this = state->next;
+        memcpy(this->virt, page_address(page), PAGE_SIZE);
+        this->pfn = page_to_pfn(page);
+        state->next = this->next;
+    }
+
+    state->active = 0;
+}
+
+/**
+ * _toi_make_writable - Defuse TOI's write protection
+ */
+int _toi_make_writable(pte_t *pte)
+{
+    struct page *page = pte_page(*pte);
+    if (PageTOI_RO(page)) {
+        pgd_t *pgd = __va(read_cr3());
+        /*
+         * If this is a TuxOnIce caused fault, we may not have permission to
+         * write to a page needed to reset the permissions of the original
+         * page. Use swapper_pg_dir to get around this.
+         */
+        load_cr3(swapper_pg_dir);
+
+        set_pte_atomic(pte, pte_mkwrite(*pte));
+        SetPageTOI_Dirty(page);
+        ClearPageTOI_RO(page);
+
+        toi_do_cbw(page);
+
+        load_cr3(pgd);
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * toi_make_writable - Handle a (potential) fault caused by TOI's write protection
+ *
+ * Make a page writable that was protected. Might be because of a fault, or
+ * because we're allocating it and want it to be untracked.
+ *
+ * Note that in the fault handling case, we don't care about the error code. If
+ * called from the double fault handler, we won't have one. We just check to
+ * see if the page was made RO by TOI, and mark it dirty/release the protection
+ * if it was.
+ */
+int toi_make_writable(pgd_t *pgd, unsigned long address)
+{
+    pud_t *pud;
+    pmd_t *pmd;
+    pte_t *pte;
+
+    pgd = pgd + pgd_index(address);
+    if (!pgd_present(*pgd))
+        return 0;
+
+    pud = pud_offset(pgd, address);
+    if (!pud_present(*pud))
+        return 0;
+
+    if (pud_large(*pud))
+        return _toi_make_writable((pte_t *) pud);
+
+    pmd = pmd_offset(pud, address);
+    if (!pmd_present(*pmd))
+        return 0;
+
+    if (pmd_large(*pmd))
+        return _toi_make_writable((pte_t *) pmd);
+
+    pte = pte_offset_kernel(pmd, address);
+    if (!pte_present(*pte))
+        return 0;
+
+    return _toi_make_writable(pte);
+}
+#endif
+
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & PF_WRITE) && !pte_write(*pte))
 		return 0;
 
 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
 		return 0;
@@ -1082,6 +1178,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		kmemcheck_hide(regs);
 	prefetchw(&mm->mmap_sem);
 
+	/*
+	 * Detect and handle page faults due to TuxOnIce making pages read-only
+	 * so that it can create incremental images.
+	 *
+	 * Do it early to avoid double faults.
+	 */
+	if (unlikely(toi_make_writable(init_mm.pgd, address)))
+		return;
+
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 493f541..695ac7d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -150,9 +150,10 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_TOI_INCREMENTAL)
 	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * For CONFIG_DEBUG_PAGEALLOC or TuxOnIce's incremental image support,
+	 * identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c
index 5241421..5eb772c 100644
--- a/arch/x86/platform/efi/early_printk.c
+++ b/arch/x86/platform/efi/early_printk.c
@@ -124,7 +124,8 @@ static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h)
 }
 
 static void
-early_efi_write(struct console *con, const char *str, unsigned int num)
+early_efi_write(struct console *con, const char *str, unsigned int num,
+		unsigned int loglevel)
 {
 	struct screen_info *si;
 	unsigned int len;
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..1fc1a4d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -39,9 +39,28 @@ config CFQ_GROUP_IOSCHED
 	---help---
 	  Enable group IO scheduling in CFQ.
 
+config IOSCHED_BFQ
+	tristate "BFQ I/O scheduler"
+	default y
+	---help---
+	  The BFQ I/O scheduler tries to distribute bandwidth among
+	  all processes according to their weights.
+	  It aims at distributing the bandwidth as desired, independently of
+	  the disk parameters and with any workload. It also tries to
+	  guarantee low latency to interactive and soft real-time
+	  applications. If compiled built-in (saying Y here), BFQ can
+	  be configured to support hierarchical scheduling.
+
+config BFQ_GROUP_IOSCHED
+	bool "BFQ hierarchical scheduling support"
+	depends on CGROUPS && IOSCHED_BFQ=y
+	default n
+	---help---
+	  Enable hierarchical scheduling in BFQ, using the blkio controller.
+
 choice
 	prompt "Default I/O scheduler"
-	default DEFAULT_CFQ
+	default DEFAULT_BFQ
 	help
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.
@@ -52,6 +71,16 @@ choice
 	config DEFAULT_CFQ
 		bool "CFQ" if IOSCHED_CFQ=y
 
+	config DEFAULT_BFQ
+		bool "BFQ" if IOSCHED_BFQ=y
+		help
+		  Selects BFQ as the default I/O scheduler which will be
+		  used by default for all block devices.
+		  The BFQ I/O scheduler aims at distributing the bandwidth
+		  as desired, independently of the disk parameters and with
+		  any workload. It also tries to guarantee low latency to
+		  interactive and soft real-time applications.
+
 	config DEFAULT_NOOP
 		bool "No-op"
 
@@ -61,6 +90,7 @@ config DEFAULT_IOSCHED
 	string
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
+	default "bfq" if DEFAULT_BFQ
 	default "noop" if DEFAULT_NOOP
 
 endmenu
diff --git a/block/Makefile b/block/Makefile
index 9eda232..37afa48 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
-			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+			uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git b/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000..5ee99ec
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1186 @@
+/*
+ * BFQ: CGROUPS support.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
+	}
+}
+
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
+
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
+}
+
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+	BUG_ON(!pd);
+	return pd_to_bfqg(pd);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
+}
+
+static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
+					      uint64_t bytes, int rw)
+{
+	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
+	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
+	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	if (!stats)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->service_bytes);
+	blkg_rwstat_reset(&stats->serviced);
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->unaccounted_time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&to->time, &from->time);
+	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+	struct bfq_group *parent;
+
+	if (!bfqg) /* root_group */
+		return;
+
+	parent = bfqg_parent(bfqg);
+
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
+	}
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->service_bytes);
+	blkg_rwstat_exit(&stats->serviced);
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->sectors);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->unaccounted_time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
+	    blkg_rwstat_init(&stats->serviced, gfp) ||
+	    blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->sectors, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->unaccounted_time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = BFQ_DEFAULT_GRP_WEIGHT;
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *bfqg;
+
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+
+	if (bfqg_stats_init(&bfqg->stats, gfp) ||
+	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	BUG_ON(!parent);
+	BUG_ON(!bfqg);
+	BUG_ON(bfqg == parent);
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+	struct bfq_entity *entity = &bfqg->entity;
+	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
+	bfqg->rq_pos_tree = RB_ROOT;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	bfqg_stats_exit(&bfqg->dead_stats);
+
+	kfree(bfqg);
+}
+
+/* offset delta from bfqg->stats to bfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
+					offsetof(struct bfq_group, stats);
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	u64 sum = 0;
+
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				       off + dead_stats_off_delta);
+	return sum;
+}
+
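+/*
+ * For example, when the function above is invoked with
+ * off = offsetof(struct bfq_group, stats.time), it returns the sum of
+ * stats.time and dead_stats.time over this group and its descendants,
+ * so that I/O accounted to groups that have already been removed is
+ * still reflected in the recursive statistics.
+ */
+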
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+						       int off)
+{
+	struct blkg_rwstat a, b;
+
+	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				      off + dead_stats_off_delta);
+	blkg_rwstat_add_aux(&a, &b);
+	return a;
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
+
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+					      struct blkcg *blkcg)
+{
+	struct request_queue *q = bfqd->queue;
+	struct bfq_group *bfqg = NULL, *parent;
+	struct bfq_entity *entity = NULL;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
+		bfqg = bfqd->root_group;
+	} else {
+		struct blkcg_gq *blkg;
+
+		blkg = blkg_lookup_create(blkcg, q);
+		if (!IS_ERR(blkg))
+			bfqg = blkg_to_bfqg(blkg);
+		else /* fallback to root_group */
+			bfqg = bfqd->root_group;
+	}
+
+	BUG_ON(!bfqg);
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		BUG_ON(!bfqg);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			BUG_ON(!parent);
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @entity: @bfqq's entity.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	int busy, resume;
+
+	busy = bfq_bfqq_busy(bfqq);
+	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
+
+	BUG_ON(resume && !entity->on_st);
+	BUG_ON(busy && !resume && entity->on_st &&
+	       bfqq != bfqd->in_service_queue);
+
+	if (busy) {
+		BUG_ON(atomic_read(&bfqq->ref) < 2);
+
+		if (!resume)
+			bfq_del_bfqq_busy(bfqd, bfqq, 0);
+		else
+			bfq_deactivate_bfqq(bfqd, bfqq, 0);
+	} else if (entity->on_st)
+		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	bfqg_put(bfqq_group(bfqq));
+
+	/*
+	 * Here we use a reference to bfqg.  We don't need a refcounter
+	 * as the cgroup reference will not be dropped, so that its
+	 * destroy() callback will not be invoked.
+	 */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
+
+	if (busy) {
+		bfq_pos_tree_add_move(bfqd, bfqq);
+		if (resume)
+			bfq_activate_bfqq(bfqd, bfqq);
+	}
+
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						struct bfq_io_cq *bic,
+						struct blkcg *blkcg)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_group *bfqg;
+	struct bfq_entity *entity;
+
+	lockdep_assert_held(bfqd->queue->queue_lock);
+
+	bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	if (async_bfqq) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq, atomic_read(&async_bfqq->ref));
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
+	}
+
+	return bfqg;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct blkcg *blkcg;
+	struct bfq_group *bfqg = NULL;
+	uint64_t id;
+
+	rcu_read_lock();
+	blkcg = bio_blkcg(bio);
+	id = blkcg->css.serial_nr;
+	rcu_read_unlock();
+
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created bic, but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
+		return;
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
+	BUG_ON(!bfqg);
+	bic->blkcg_id = id;
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entity = st->first_idle;
+
+	for (; entity ; entity = st->first_idle)
+		__bfq_deactivate_entity(entity, 0);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	BUG_ON(!bfqq);
+	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
+	return;
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
+{
+	struct rb_root *active = &st->active;
+	struct bfq_entity *entity = NULL;
+
+	if (!RB_EMPTY_ROOT(&st->active))
+		entity = bfq_entity_of(rb_first(active));
+
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
+		bfq_reparent_leaf_entity(bfqd, entity);
+
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+			bfqg->sched_data.in_service_entity);
+
+	return;
+}
+
+/**
+ * bfq_destroy_group - destroy @bfqg.
+ * @bfqg: the group being destroyed.
+ *
+ * Destroy @bfqg, making sure that it is not referenced from its parent.
+ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
+ */
+static void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+	struct bfq_service_tree *st;
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+	struct bfq_entity *entity;
+	int i;
+
+	BUG_ON(!pd);
+	bfqg = pd_to_bfqg(pd);
+	BUG_ON(!bfqg);
+	bfqd = bfqg->bfqd;
+	BUG_ON(bfqd && !bfqd->root_group);
+
+	entity = bfqg->my_entity;
+
+	if (!entity) /* root group */
+		return;
+
+	/*
+	 * Empty all service_trees belonging to this group before
+	 * deactivating the group itself.
+	 */
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+		BUG_ON(!bfqg->sched_data.service_tree);
+		st = bfqg->sched_data.service_tree + i;
+		/*
+		 * The idle tree may still contain bfq_queues belonging
+		 * to exited tasks because they never migrated to a different
+		 * cgroup from the one being destroyed now.  No one else
+		 * can access them so it's safe to act without any lock.
+		 */
+		bfq_flush_idle_tree(st);
+
+		/*
+		 * It may happen that some queues are still active
+		 * (busy) upon group destruction (if the corresponding
+		 * processes have been forced to terminate). We move
+		 * all the leaf entities corresponding to these queues
+		 * to the root_group.
+		 * Also, it may happen that the group has an entity
+		 * in service, which is disconnected from the active
+		 * tree: it must be moved, too.
+		 * There is no need to put the sync queues, as the
+		 * scheduler has taken no reference.
+		 */
+		bfq_reparent_active_entities(bfqd, bfqg, st);
+		BUG_ON(!RB_EMPTY_ROOT(&st->active));
+		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+	}
+	BUG_ON(bfqg->sched_data.next_in_service);
+	BUG_ON(bfqg->sched_data.in_service_entity);
+
+	__bfq_deactivate_entity(entity, 0);
+	bfq_put_async_queues(bfqd, bfqg);
+	BUG_ON(entity->tree);
+
+	bfqg_stats_xfer_dead(bfqg);
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		bfq_end_wr_async_queues(bfqd, bfqg);
+	}
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
+				       struct cftype *cftype)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	int ret = -EINVAL;
+
+	spin_lock_irq(&blkcg->lock);
+	ret = bfqgd->weight;
+	spin_unlock_irq(&blkcg->lock);
+
+	return ret;
+}
+
+static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+
+	spin_lock_irq(&blkcg->lock);
+	seq_printf(sf, "%u\n", bfqgd->weight);
+	spin_unlock_irq(&blkcg->lock);
+
+	return 0;
+}
+
+static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
+					struct cftype *cftype,
+					u64 val)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -EINVAL;
+
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
+
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		if (!bfqg)
+			continue;
+		/*
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		 */
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			/*
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			 */
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	return ret;
+}
+
+static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,
+					     char *buf, size_t nbytes,
+					     loff_t off)
+{
+	/* First unsigned long found in the file is used */
+	return bfqio_cgroup_weight_write(of_css(of), NULL,
+					 simple_strtoull(strim(buf), NULL, 0));
+}
+
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+	return 0;
+}
+
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	int ret;
+
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
+
+	return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *bgd;
+
+	bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
+	if (!bgd)
+		return NULL;
+	return &bgd->pd;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_bfqgd(cpd));
+}
+
+static struct cftype bfqio_files_dfl[] = {
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfqio_cgroup_weight_read_dfl,
+		.write = bfqio_cgroup_weight_write_dfl,
+	},
+	{} /* terminate */
+};
+
+static struct cftype bfqio_files[] = {
+	{
+		.name = "bfq.weight",
+		.read_u64 = bfqio_cgroup_weight_read,
+		.write_u64 = bfqio_cgroup_weight_write,
+	},
+	/* statistics, cover only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statistics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.unaccounted_time",
+		.private = offsetof(struct bfq_group, stats.unaccounted_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
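+/*
+ * With the legacy (cgroup v1) blkio controller, the entries above are
+ * exposed with the "blkio." prefix; for instance, assuming the blkio
+ * controller is mounted under /sys/fs/cgroup/blkio, the per-group weight
+ * defined by "bfq.weight" can be set with:
+ *
+ *	echo 200 > /sys/fs/cgroup/blkio/<group>/blkio.bfq.weight
+ */
+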
+static struct blkcg_policy blkcg_policy_bfq = {
+	.dfl_cftypes		= bfqio_files_dfl,
+	.legacy_cftypes		= bfqio_files,
+
+	.pd_alloc_fn		= bfq_pd_alloc,
+	.pd_init_fn		= bfq_pd_init,
+	.pd_offline_fn		= bfq_pd_offline,
+	.pd_free_fn		= bfq_pd_free,
+	.pd_reset_stats_fn	= bfq_pd_reset_stats,
+
+	.cpd_alloc_fn		= bfq_cpd_alloc,
+	.cpd_init_fn		= bfq_cpd_init,
+	.cpd_bind_fn		= bfq_cpd_init,
+	.cpd_free_fn		= bfq_cpd_free,
+};
+
+#else
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	return bfqd->root_group;
+}
+
+static void bfq_bfqq_move(struct bfq_data *bfqd,
+			  struct bfq_queue *bfqq,
+			  struct bfq_entity *entity,
+			  struct bfq_group *bfqg)
+{
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static void bfq_disconnect_groups(struct bfq_data *bfqd)
+{
+	bfq_put_async_queues(bfqd, bfqd->root_group);
+}
+
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+                                              struct blkcg *blkcg)
+{
+	return bfqd->root_group;
+}
+
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	struct bfq_group *bfqg;
+	int i;
+
+	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+	if (!bfqg)
+		return NULL;
+
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+	return bfqg;
+}
+#endif
diff --git b/block/bfq-ioc.c b/block/bfq-ioc.c
new file mode 100644
index 0000000..fb7bb8f
--- /dev/null
+++ b/block/bfq-ioc.c
@@ -0,0 +1,36 @@
+/*
+ * BFQ: I/O context handling.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+	/* bic->icq is the first member, %NULL will convert to %NULL */
+	return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ *
+ * Queue lock must be held.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc)
+{
+	if (ioc)
+		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
+	return NULL;
+}
diff --git b/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 0000000..d1f648d
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,4413 @@
+/*
+ * Budget Fair Queueing (BFQ) disk scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
+ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
+ * measured in number of sectors, to processes instead of time slices. The
+ * device is not granted to the in-service process for a given time slice,
+ * but until it has exhausted its assigned budget. This change from the time
+ * to the service domain allows BFQ to distribute the device throughput
+ * among processes as desired, without any distortion due to ZBR, workload
+ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
+ * called B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated to processes. Thanks to the
+ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
+ * I/O-bound processes issuing sequential requests (to boost the
+ * throughput), and yet guarantee a low latency to interactive and soft
+ * real-time applications.
+ *
+ * BFQ is described in [1], where a reference to the initial, more
+ * theoretical paper on BFQ can also be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
+ *     with the BFQ Disk I/O Scheduler'',
+ *     Proceedings of the 5th Annual International Systems and Storage
+ *     Conference (SYSTOR '12), June 2012.
+ *
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ *     Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = HZ / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT		2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES	32
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
+ * are the reference values for a slow/fast rotational device, whereas
+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes.
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1536, 10752};
+static int R_fast[2] = {17415, 34791};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
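+/*
+ * Worked example with purely illustrative numbers: if the estimated peak
+ * rate r of the device at hand turns out to be half the reference rate R
+ * of its speed class, then duration = (R / r) * T = 2 * T, i.e.,
+ * interactive queues on that device are weight-raised for twice the
+ * reference time T of that class.
+ */
+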
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
+
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples)	((samples) > 80)
+
+/*
+ * We regard a request as SYNC if it is either a read or has the SYNC bit
+ * set (in which case it could also be a direct WRITE).
+ */
+static int bfq_bio_sync(struct bio *bio)
+{
+	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		kblockd_schedule_work(&bfqd->unplug_work);
+	}
+}
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
+ * We choose the request that is closer to the head right now.  Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
+
+	if (!rq1 || rq1 == rq2)
+		return rq2;
+	if (!rq2)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+		return rq1;
+	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+		return rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/*
+	 * By definition, 1KiB is 2 sectors.
+	 */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek.
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (s1 + back_max >= last)
+		d1 = (last - s1) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ1_WRAP;
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (s2 + back_max >= last)
+		d2 = (last - s2) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ2_WRAP;
+
+	/* Found required data */
+
+	/*
+	 * By doing switch() on the bit mask "wrap" we avoid having to
+	 * check two variables for all permutations: --> faster!
+	 */
+	switch (wrap) {
+	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+		if (d1 < d2)
+			return rq1;
+		else if (d2 < d1)
+			return rq2;
+		else {
+			if (s1 >= s2)
+				return rq1;
+			else
+				return rq2;
+		}
+
+	case BFQ_RQ2_WRAP:
+		return rq1;
+	case BFQ_RQ1_WRAP:
+		return rq2;
+	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
+	default:
+		/*
+		 * Since both rqs are wrapped,
+		 * start with the one that's further behind head
+		 * (--> only *one* back seek required),
+		 * since back seek takes more time than forward.
+		 */
+		if (s1 <= s2)
+			return rq1;
+		else
+			return rq2;
+	}
+}
+
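+/*
+ * Illustrative example of the distance computation above (hypothetical
+ * numbers, with bfq_back_penalty = 2, a sufficiently large back_max, and
+ * two requests that do not differ in sync/meta status): if the head is
+ * at sector 1000, rq1 starts at sector 1100 and rq2 at sector 980, then
+ * d1 = 100 while d2 = (1000 - 980) * 2 = 40, so rq2 is chosen even
+ * though it lies behind the head.
+ */
+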
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+		     sector_t sector, struct rb_node **ret_parent,
+		     struct rb_node ***rb_link)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *bfqq = NULL;
+
+	parent = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct rb_node **n;
+
+		parent = *p;
+		bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+		/*
+		 * Sort strictly based on sector. Smallest to the left,
+		 * largest to the right.
+		 */
+		if (sector > blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_right;
+		else if (sector < blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_left;
+		else
+			break;
+		p = n;
+		bfqq = NULL;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = p;
+
+	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+		(long long unsigned)sector,
+		bfqq ? bfqq->pid : 0);
+
+	return bfqq;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *__bfqq;
+
+	if (bfqq->pos_root) {
+		rb_erase(&bfqq->pos_node, bfqq->pos_root);
+		bfqq->pos_root = NULL;
+	}
+
+	if (bfq_class_idle(bfqq))
+		return;
+	if (!bfqq->next_rq)
+		return;
+
+	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+			blk_rq_pos(bfqq->next_rq), &parent, &p);
+	if (!__bfqq) {
+		rb_link_node(&bfqq->pos_node, parent, p);
+		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+	} else
+		bfqq->pos_root = NULL;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	/*
+	 * For weights to differ, at least one of the trees must contain
+	 * at least two nodes.
+	 */
+	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	       ) ||
+	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+		(bfqd->group_weights_tree.rb_node->rb_left ||
+		 bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+	       );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	return
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		!bfqd->active_numerous_groups &&
+#endif
+		!bfq_differentiated_weights(bfqd);
+}
+
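+/*
+ * For instance, a scenario with two active queues having the same weight
+ * and no active groups is symmetric: the queue weights tree contains a
+ * single node. Assigning a different weight to one of the two queues
+ * creates a second node, bfq_differentiated_weights() returns true and
+ * the scenario becomes asymmetric.
+ */
+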
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	if (!entity->weight_counter)
+		return; /* on allocation failure, just leave the entity uncounted */
+
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	BUG_ON(RB_EMPTY_ROOT(root));
+	BUG_ON(entity->weight_counter->weight != entity->weight);
+
+	BUG_ON(!entity->weight_counter->num_active);
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
+
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+	if (rbprev)
+		prev = rb_entry_rq(rbprev);
+
+	if (rbnext)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&bfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
+{
+	return blk_rq_sectors(rq) *
+		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
+		bfq_async_charge_factor));
+}
+
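+/*
+ * For example, with bfq_async_charge_factor = 10, an async request of 8
+ * sectors issued by a non-weight-raised queue (wr_coeff == 1) is charged
+ * 8 * (1 + 10) = 88 sectors of service, whereas a sync request of the
+ * same size is charged just its 8 sectors.
+ */
+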
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown).  We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct request *next_rq = bfqq->next_rq;
+	unsigned long new_budget;
+
+	if (!next_rq)
+		return;
+
+	if (bfqq == bfqd->in_service_queue)
+		/*
+		 * In order not to break guarantees, budgets cannot be
+		 * changed after an entity has been selected.
+		 */
+		return;
+
+	BUG_ON(entity->tree != &st->active);
+	BUG_ON(entity == entity->sched_data->in_service_entity);
+
+	new_budget = max_t(unsigned long, bfqq->max_budget,
+			   bfq_serv_to_charge(next_rq, bfqq));
+	if (entity->budget != new_budget) {
+		entity->budget = new_budget;
+		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+					 new_budget);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	return dur;
+}
+
+static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq)
+{
+	return bfqq->bic ? bfqq->bic->cooperations : 0;
+}
+
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	if (bic->saved_idle_window)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+	if (bic->saved_IO_bound)
+		bfq_mark_bfqq_IO_bound(bfqq);
+	else
+		bfq_clear_bfqq_IO_bound(bfqq);
+	/* Assuming that the flag in_large_burst is already correctly set */
+	if (bic->wr_time_left && bfqq->bfqd->low_latency &&
+	    !bfq_bfqq_in_large_burst(bfqq) &&
+	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
+		/*
+		 * Start a weight raising period with the duration given by
+		 * the raising_time_left snapshot.
+		 */
+		if (bfq_bfqq_busy(bfqq))
+			bfqq->bfqd->wr_busy_queues++;
+		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+		bfqq->wr_cur_max_time = bic->wr_time_left;
+		bfqq->last_wr_start_finish = jiffies;
+		bfqq->entity.prio_changed = 1;
+	}
+	/*
+	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
+	 * getting confused about the queue's need of a weight-raising
+	 * period.
+	 */
+	bic->wr_time_left = 0;
+}
+
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+	int process_refs, io_refs;
+
+	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+
+	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
+	BUG_ON(process_refs < 0);
+	return process_refs;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+				     burst_list_node)
+			bfq_mark_bfqq_in_large_burst(bfqq_item);
+		bfq_mark_bfqq_in_large_burst(bfqq);
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /* burst not yet large: add bfqq to the burst list */
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues happen to become active shortly after each other, then,
+ * to help the processes associated to these queues get their job done as
+ * soon as possible, it is usually better to not grant either weight-raising
+ * or device idling to these queues. In this comment we describe, firstly,
+ * the reasons why this fact holds, and, secondly, the next function, which
+ * implements the main steps needed to properly mark these queues so that
+ * they can then be treated in a different way.
+ *
+ * As for the terminology, we say that a queue becomes active, i.e.,
+ * switches from idle to backlogged, either when it is created (as a
+ * consequence of the arrival of an I/O request), or, if already existing,
+ * when a new request for the queue arrives while the queue is idle.
+ * Bursts of activations, i.e., activations of different queues occurring
+ * shortly after each other, are typically caused by services or applications
+ * that spawn or reactivate many parallel threads/processes. Examples are
+ * systemd during boot or git grep.
+ *
+ * These services or applications benefit mostly from a high throughput:
+ * the quicker the requests of the activated queues are cumulatively served,
+ * the sooner the target job of these queues gets completed. As a consequence,
+ * weight-raising any of these queues, which also implies idling the device
+ * for it, is almost always counterproductive: in most cases it just lowers
+ * throughput.
+ *
+ * On the other hand, a burst of activations may also be caused by the start
+ * of an application that does not consist of a lot of parallel I/O-bound
+ * threads. In fact, with a complex application, the burst may be just a
+ * consequence of the fact that several processes need to be executed to
+ * start-up the application. To start an application as quickly as possible,
+ * the best thing to do is to privilege the I/O related to the application
+ * with respect to all other I/O. Therefore, the best strategy to start an
+ * application that causes a burst of activations as quickly as possible is
+ * to weight-raise all the queues activated during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the two
+ * types of bursts need to be distinguished. Fortunately, this seems
+ * relatively easy to do, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that bursts larger than that
+ * threshold are apparently caused only by services or commands such as
+ * systemd or git grep. For brevity, hereafter we simply call these
+ * bursts 'large'. BFQ *does not* weight-raise queues whose activations occur
+ * in a large burst. In addition, for each of these queues BFQ performs or
+ * does not perform idling depending on which choice boosts the throughput
+ * most. The exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Turning back to the next function, it implements all the steps needed
+ * to detect the occurrence of a large burst and to properly mark all the
+ * queues belonging to it (so that they can then be treated in a different
+ * way). This goal is achieved by maintaining a special "burst list" that
+ * holds, temporarily, the queues that belong to the burst in progress. The
+ * list is then used to mark these queues as belonging to a large burst if
+ * the burst does become large. The main steps are the following.
+ *
+ * . when the very first queue is activated, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (temporarily keeping track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and is no longer needed
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is activated while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is activated a while
+ *   after, i.e., not shortly after, the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			     bool idle_for_long_time)
+{
+	/*
+	 * If bfqq happened to be activated in a burst, but has been idle
+	 * for at least as long as an interactive queue, then we assume
+	 * that, in the overall I/O initiated in the burst, the I/O
+	 * associated to bfqq is finished. So bfqq does not need to be
+	 * treated as a queue belonging to a burst anymore. Accordingly,
+	 * we reset bfqq's in_large_burst flag if set, and remove bfqq
+	 * from the burst list if it's there. We do not, however, decrement
+	 * burst_size, because the fact that bfqq no longer needs to belong
+	 * to the burst list does not invalidate the fact that
+	 * bfqq may have been activated during the current burst.
+	 */
+	if (idle_for_long_time) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, then there is nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq))
+		return;
+
+	/*
+	 * If bfqq's activation happens late enough, then the current
+	 * burst is finished, and related data structures must be reset.
+	 *
+	 * In this respect, consider the special case where bfqq is the very
+	 * first queue being activated. In this case, last_ins_in_burst is
+	 * not yet significant when we get here. But it is easy to verify
+	 * that, whether or not the following condition is true, bfqq will
+	 * end up being inserted into the burst list. In particular the
+	 * list will happen to contain only bfqq. And this is exactly what
+	 * has to happen, as bfqq may be the first queue in a possible
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval)) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+}
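+
+/*
+ * Purely illustrative timeline of the logic above, assuming a
+ * hypothetical large-burst threshold of 4 and a burst interval of J
+ * jiffies (the actual values are bfqd->bfq_large_burst_thresh and
+ * bfqd->bfq_burst_interval); each activation below occurs within J
+ * jiffies of the previous one, except the last:
+ *
+ *   t0         Q1 activated: burst list = {Q1}, burst_size = 1
+ *   t0 + J/2   Q2 activated: burst list = {Q1, Q2}, burst_size = 2
+ *   t0 + J     Q3 activated: burst list = {Q1, Q2, Q3}, burst_size = 3
+ *   t0 + 3J/2  Q4 activated: threshold reached, Q1..Q4 are marked as
+ *              in_large_burst, the burst list is emptied and the
+ *              device enters large-burst mode
+ *   t0 + 2J    Q5 activated: marked in_large_burst immediately
+ *   t0 + 10J   Q6 activated: too late, large-burst mode is reset and
+ *              the burst list restarts as {Q6}
+ */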
+
+static void bfq_add_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct request *next_rq, *prev;
+	unsigned long old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+	bfqq->queued[rq_is_sync(rq)]++;
+	bfqd->queued++;
+
+	elv_rb_add(&bfqq->sort_list, rq);
+
+	/*
+	 * Check if this request is a better next-serve candidate.
+	 */
+	prev = bfqq->next_rq;
+	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+	BUG_ON(!next_rq);
+	bfqq->next_rq = next_rq;
+
+	/*
+	 * Adjust priority tree position, if next_rq changes.
+	 */
+	if (prev != bfqq->next_rq)
+		bfq_pos_tree_add_move(bfqd, bfqq);
+
+	if (!bfq_bfqq_busy(bfqq)) {
+		bool soft_rt, coop_or_in_burst,
+		     idle_for_long_time = time_is_before_jiffies(
+						bfqq->budget_timeout +
+						bfqd->bfq_wr_min_idle_time);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+					 rq->cmd_flags);
+#endif
+		if (bfq_bfqq_sync(bfqq)) {
+			bool already_in_burst =
+			   !hlist_unhashed(&bfqq->burst_list_node) ||
+			   bfq_bfqq_in_large_burst(bfqq);
+			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
+			/*
+			 * If bfqq was not already in the current burst,
+			 * then, at this point, bfqq either has been
+			 * added to the current burst or has caused the
+			 * current burst to terminate. In particular, in
+			 * the second case, bfqq has become the first
+			 * queue in a possible new burst.
+			 * In both cases last_ins_in_burst needs to be
+			 * moved forward.
+			 */
+			if (!already_in_burst)
+				bfqd->last_ins_in_burst = jiffies;
+		}
+
+		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
+			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
+		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+			!coop_or_in_burst &&
+			time_is_before_jiffies(bfqq->soft_rt_next_start);
+		interactive = !coop_or_in_burst && idle_for_long_time;
+		entity->budget = max_t(unsigned long, bfqq->max_budget,
+				       bfq_serv_to_charge(next_rq, bfqq));
+
+		if (!bfq_bfqq_IO_bound(bfqq)) {
+			if (time_before(jiffies,
+					RQ_BIC(rq)->ttime.last_end_request +
+					bfqd->bfq_slice_idle)) {
+				bfqq->requests_within_timer++;
+				if (bfqq->requests_within_timer >=
+				    bfqd->bfq_requests_within_timer)
+					bfq_mark_bfqq_IO_bound(bfqq);
+			} else
+				bfqq->requests_within_timer = 0;
+		}
+
+		if (!bfqd->low_latency)
+			goto add_bfqq_busy;
+
+		if (bfq_bfqq_just_split(bfqq))
+			goto set_prio_changed;
+
+		/*
+		 * If the queue:
+		 * - is not being boosted,
+		 * - has been idle for enough time,
+		 * - is not a sync queue or is linked to a bfq_io_cq (it is
+		 *   shared "by its nature" or it is not shared and its
+		 *   requests have not been redirected to a shared queue),
+		 * then start a weight-raising period.
+		 */
+		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
+		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais starting at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		} else if (old_wr_coeff > 1) {
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else if (coop_or_in_burst ||
+				 (bfqq->wr_cur_max_time ==
+				  bfqd->bfq_wr_rt_max_time &&
+				  !soft_rt)) {
+				bfqq->wr_coeff = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"wrais ending at %lu, rais_max_time %u",
+					jiffies,
+					jiffies_to_msecs(bfqq->
+						wr_cur_max_time));
+			} else if (time_before(
+					bfqq->last_wr_start_finish +
+					bfqq->wr_cur_max_time,
+					jiffies +
+					bfqd->bfq_wr_rt_max_time) &&
+				   soft_rt) {
+				/*
+				 * The remaining weight-raising time is lower
+				 * than bfqd->bfq_wr_rt_max_time, which means
+				 * that the application is enjoying weight
+				 * raising either because deemed soft-rt in
+				 * the near past, or because deemed interactive
+				 * long ago.
+				 * In both cases, resetting now the current
+				 * remaining weight-raising time for the
+				 * application to the weight-raising duration
+				 * for soft rt applications would not cause any
+				 * latency increase for the application (as the
+				 * new duration would be higher than the
+				 * remaining time).
+				 *
+				 * In addition, the application is now meeting
+				 * the requirements for being deemed soft rt.
+				 * In the end we can correctly and safely
+				 * (re)charge the weight-raising duration for
+				 * the application with the weight-raising
+				 * duration for soft rt applications.
+				 *
+				 * In particular, doing this recharge now, i.e.,
+				 * before the weight-raising period for the
+				 * application finishes, reduces the probability
+				 * of the following negative scenario:
+				 * 1) the weight of a soft rt application is
+				 *    raised at startup (as for any newly
+				 *    created application),
+				 * 2) since the application is not interactive,
+				 *    at a certain time weight-raising is
+				 *    stopped for the application,
+				 * 3) at that time the application happens to
+				 *    still have pending requests, and hence
+				 *    is destined to not have a chance to be
+				 *    deemed soft rt before these requests are
+				 *    completed (see the comments to the
+				 *    function bfq_bfqq_softrt_next_start()
+				 *    for details on soft rt detection),
+				 * 4) these pending requests experience a high
+				 *    latency because the application is not
+				 *    weight-raised while they are pending.
+				 */
+				bfqq->last_wr_start_finish = jiffies;
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			}
+		}
+set_prio_changed:
+		if (old_wr_coeff != bfqq->wr_coeff)
+			entity->prio_changed = 1;
+add_bfqq_busy:
+		bfqq->last_idle_bklogged = jiffies;
+		bfqq->service_from_backlogged = 0;
+		bfq_clear_bfqq_softrt_update(bfqq);
+		bfq_add_bfqq_busy(bfqd, bfqq);
+	} else {
+		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+		    time_is_before_jiffies(
+				bfqq->last_wr_start_finish +
+				bfqd->bfq_wr_min_inter_arr_async)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+			bfqd->wr_busy_queues++;
+			entity->prio_changed = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+			    "non-idle wrais starting at %lu, rais_max_time %u",
+			    jiffies,
+			    jiffies_to_msecs(bfqq->wr_cur_max_time));
+		}
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (bfqd->low_latency &&
+		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+		bfqq->last_wr_start_finish = jiffies;
+}
+
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+					  struct bio *bio)
+{
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (!bic)
+		return NULL;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	if (bfqq)
+		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
+
+	return NULL;
+}
+
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	bfqd->rq_in_driver++;
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
+		(long long unsigned)bfqd->last_position);
+}
+
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	BUG_ON(bfqd->rq_in_driver == 0);
+	bfqd->rq_in_driver--;
+}
+
+static void bfq_remove_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	const int sync = rq_is_sync(rq);
+
+	if (bfqq->next_rq == rq) {
+		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+		bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (rq->queuelist.prev != &rq->queuelist)
+		list_del_init(&rq->queuelist);
+	BUG_ON(bfqq->queued[sync] == 0);
+	bfqq->queued[sync]--;
+	bfqd->queued--;
+	elv_rb_del(&bfqq->sort_list, rq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
+			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		/*
+		 * Remove queue from request-position tree as it is empty.
+		 */
+		if (bfqq->pos_root) {
+			rb_erase(&bfqq->pos_node, bfqq->pos_root);
+			bfqq->pos_root = NULL;
+		}
+	}
+
+	if (rq->cmd_flags & REQ_META) {
+		BUG_ON(bfqq->meta_pending == 0);
+		bfqq->meta_pending--;
+	}
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+#endif
+}
+
+static int bfq_merge(struct request_queue *q, struct request **req,
+		     struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct request *__rq;
+
+	__rq = bfq_find_rq_fmerge(bfqd, bio);
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+static void bfq_merged_request(struct request_queue *q, struct request *req,
+			       int type)
+{
+	if (type == ELEVATOR_FRONT_MERGE &&
+	    rb_prev(&req->rb_node) &&
+	    blk_rq_pos(req) <
+	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
+				    struct request, rb_node))) {
+		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_data *bfqd = bfqq->bfqd;
+		struct request *prev, *next_rq;
+
+		/* Reposition request in its sort_list */
+		elv_rb_del(&bfqq->sort_list, req);
+		elv_rb_add(&bfqq->sort_list, req);
+		/* Choose next request to be served for bfqq */
+		prev = bfqq->next_rq;
+		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+					 bfqd->last_position);
+		BUG_ON(!next_rq);
+		bfqq->next_rq = next_rq;
+		/*
+		 * If next_rq changes, update both the queue's budget to
+		 * fit the new request and the queue's position in its
+		 * rq_pos_tree.
+		 */
+		if (prev != bfqq->next_rq) {
+			bfq_updated_next_req(bfqd, bfqq);
+			bfq_pos_tree_add_move(bfqd, bfqq);
+		}
+	}
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
+			   struct bio *bio)
+{
+	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);
+}
+#endif
+
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+				struct request *next)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+	/*
+	 * If next and rq belong to the same bfq_queue and next is older
+	 * than rq, then reposition rq in the fifo (by substituting next
+	 * with rq). Otherwise, if next and rq belong to different
+	 * bfq_queues, never reposition rq: in fact, we would have to
+	 * reposition it with respect to next's position in its own fifo,
+	 * which would most certainly be too expensive with respect to
+	 * the benefits.
+	 */
+	if (bfqq == next_bfqq &&
+	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+	    time_before(next->fifo_time, rq->fifo_time)) {
+		list_del_init(&rq->queuelist);
+		list_replace_init(&next->queuelist, &rq->queuelist);
+		rq->fifo_time = next->fifo_time;
+	}
+
+	if (bfqq->next_rq == next)
+		bfqq->next_rq = rq;
+
+	bfq_remove_request(next);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+#endif
+}
+
+/* Must be called with bfqq != NULL */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	BUG_ON(!bfqq);
+	if (bfq_bfqq_busy(bfqq))
+		bfqq->bfqd->wr_busy_queues--;
+	bfqq->wr_coeff = 1;
+	bfqq->wr_cur_max_time = 0;
+	/* Trigger a weight change on the next activation of the queue */
+	bfqq->entity.prio_changed = 1;
+}
+
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j])
+				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+}
+
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+	if (request)
+		return blk_rq_pos(io_struct);
+	else
+		return ((struct bio *)io_struct)->bi_iter.bi_sector;
+}
+
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+				  sector_t sector)
+{
+	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
+	       BFQQ_SEEK_THR;
+}
+
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+					 struct bfq_queue *bfqq,
+					 sector_t sector)
+{
+	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	struct rb_node *parent, *node;
+	struct bfq_queue *__bfqq;
+
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
+
+	/*
+	 * First, if we find a request starting at the end of the last
+	 * request, choose it.
+	 */
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+	if (__bfqq)
+		return __bfqq;
+
+	/*
+	 * If the exact sector wasn't found, the parent of the NULL leaf
+	 * will contain the closest sector (rq_pos_tree sorted by
+	 * next_request position).
+	 */
+	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	if (blk_rq_pos(__bfqq->next_rq) < sector)
+		node = rb_next(&__bfqq->pos_node);
+	else
+		node = rb_prev(&__bfqq->pos_node);
+	if (!node)
+		return NULL;
+
+	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	return NULL;
+}
+
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+						   struct bfq_queue *cur_bfqq,
+						   sector_t sector)
+{
+	struct bfq_queue *bfqq;
+
+	/*
+	 * We try to detect whether some of the queues are cooperating,
+	 * e.g., working closely on the same area of the device. In
+	 * that case, we can group them together and: 1) don't waste
+	 * time idling, and 2) serve the union of their requests in
+	 * the best possible order for throughput.
+	 */
+	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
+	if (!bfqq || bfqq == cur_bfqq)
+		return NULL;
+
+	return bfqq;
+}
+
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	int process_refs, new_process_refs;
+	struct bfq_queue *__bfqq;
+
+	/*
+	 * If there are no process references on the new_bfqq, then it is
+	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+	 * may have dropped their last reference (not just their last process
+	 * reference).
+	 */
+	if (!bfqq_process_refs(new_bfqq))
+		return NULL;
+
+	/* Avoid a circular list and skip interim queue merges. */
+	while ((__bfqq = new_bfqq->new_bfqq)) {
+		if (__bfqq == bfqq)
+			return NULL;
+		new_bfqq = __bfqq;
+	}
+
+	process_refs = bfqq_process_refs(bfqq);
+	new_process_refs = bfqq_process_refs(new_bfqq);
+	/*
+	 * If the process for the bfqq has gone away, there is no
+	 * sense in merging the queues.
+	 */
+	if (process_refs == 0 || new_process_refs == 0)
+		return NULL;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+		new_bfqq->pid);
+
+	/*
+	 * Merging is just a redirection: the requests of the process
+	 * owning one of the two queues are redirected to the other queue.
+	 * The latter queue, in its turn, is set as shared if this is the
+	 * first time that the requests of some process are redirected to
+	 * it.
+	 *
+	 * We redirect bfqq to new_bfqq and not the opposite, because we
+	 * are in the context of the process owning bfqq, hence we have
+	 * the io_cq of this process. So we can immediately configure this
+	 * io_cq to redirect the requests of the process to new_bfqq.
+	 *
+	 * NOTE, even if new_bfqq coincides with the in-service queue, the
+	 * io_cq of new_bfqq is not available, because, if the in-service
+	 * queue is shared, bfqd->in_service_bic may not point to the
+	 * io_cq of the in-service queue.
+	 * Redirecting the requests of the process owning bfqq to the
+	 * currently in-service queue is in any case the best option, as
+	 * we feed the in-service queue with new requests close to the
+	 * last request served and, by doing so, hopefully increase the
+	 * throughput.
+	 */
+	bfqq->new_bfqq = new_bfqq;
+	atomic_add(process_refs, &new_bfqq->ref);
+	return new_bfqq;
+}
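+
+/*
+ * Sketch of the chain walk in bfq_setup_merge(), with hypothetical
+ * queues: if Q2 has already been scheduled to be merged into Q3
+ * (Q2->new_bfqq == Q3), then a later bfq_setup_merge(Q1, Q2) skips the
+ * interim queue and sets Q1->new_bfqq = Q3, so that the requests of the
+ * process owning Q1 get redirected directly to the final queue of the
+ * chain. If following the chain from Q2 led back to Q1, the merge
+ * would be refused, to avoid creating a circular list.
+ */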
+
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+					struct bfq_queue *new_bfqq)
+{
+	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
+	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
+		return false;
+
+	/*
+	 * If either of the queues has already been detected as seeky,
+	 * then merging it with the other queue is unlikely to lead to
+	 * sequential I/O.
+	 */
+	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+		return false;
+
+	/*
+	 * Interleaved I/O is known to be done by (some) applications
+	 * only for reads, so it does not make sense to merge async
+	 * queues.
+	 */
+	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
+		return false;
+
+	return true;
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service queue
+ * or with a close queue among the scheduled queues.
+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * The OOM queue is not allowed to participate in cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out-of-memory
+ * situation, the benefits of queue merging may be of little relevance, or
+ * even negligible.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		     void *io_struct, bool request)
+{
+	struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+	if (bfqq->new_bfqq)
+		return bfqq->new_bfqq;
+	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
+		return NULL;
+	/* If device has only one backlogged bfq_queue, don't search. */
+	if (bfqd->busy_queues == 1)
+		return NULL;
+
+	in_service_bfqq = bfqd->in_service_queue;
+
+	if (!in_service_bfqq || in_service_bfqq == bfqq ||
+	    !bfqd->in_service_bic ||
+	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+		goto check_scheduled;
+
+	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
+	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+		if (new_bfqq)
+			return new_bfqq;
+	}
+	/*
+	 * Check whether there is a cooperator among currently scheduled
+	 * queues. The only thing we need is that the bio/request is not
+	 * NULL, as we need it to establish whether a cooperator exists.
+	 */
+check_scheduled:
+	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+			bfq_io_struct_pos(io_struct, request));
+
+	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
+
+	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
+		return bfq_setup_merge(bfqq, new_bfqq);
+
+	return NULL;
+}
+
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+	/*
+	 * If !bfqq->bic, the queue is already shared or its requests
+	 * have already been redirected to a shared queue; both idle window
+	 * and weight raising state have already been saved. Do nothing.
+	 */
+	if (!bfqq->bic)
+		return;
+	if (bfqq->bic->wr_time_left)
+		/*
+		 * This is the queue of a just-started process, and would
+		 * deserve weight raising: we set wr_time_left to the full
+		 * weight-raising duration to trigger weight-raising when
+		 * and if the queue is split and the first request of the
+		 * queue is enqueued.
+		 */
+		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
+	else if (bfqq->wr_coeff > 1) {
+		unsigned long wr_duration =
+			jiffies - bfqq->last_wr_start_finish;
+		/*
+		 * It may happen that a queue's weight raising period lasts
+		 * longer than its wr_cur_max_time, as weight raising is
+		 * handled only when a request is enqueued or dispatched (it
+		 * does not use any timer). If the weight raising period is
+		 * about to end, don't save it.
+		 */
+		if (bfqq->wr_cur_max_time <= wr_duration)
+			bfqq->bic->wr_time_left = 0;
+		else
+			bfqq->bic->wr_time_left =
+				bfqq->wr_cur_max_time - wr_duration;
+		/*
+		 * The bfq_queue is becoming shared or the requests of the
+		 * process owning the queue are being redirected to a shared
+		 * queue. Stop the weight raising period of the queue, as in
+		 * both cases it should not be owned by an interactive or
+		 * soft real-time application.
+		 */
+		bfq_bfqq_end_wr(bfqq);
+	} else
+		bfqq->bic->wr_time_left = 0;
+	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+	bfqq->bic->cooperations++;
+	bfqq->bic->failed_cooperations = 0;
+}
+
+static void bfq_get_bic_reference(struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
+	 * is about to begin using a shared bfq_queue.
+	 */
+	if (bfqq->bic)
+		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
+}
+
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+		(long unsigned)new_bfqq->pid);
+	/* Save weight raising and idle window of the merged queues */
+	bfq_bfqq_save_state(bfqq);
+	bfq_bfqq_save_state(new_bfqq);
+	if (bfq_bfqq_IO_bound(bfqq))
+		bfq_mark_bfqq_IO_bound(new_bfqq);
+	bfq_clear_bfqq_IO_bound(bfqq);
+	/*
+	 * Grab a reference to the bic, to prevent it from being destroyed
+	 * before being possibly touched by a bfq_split_bfqq().
+	 */
+	bfq_get_bic_reference(bfqq);
+	bfq_get_bic_reference(new_bfqq);
+	/*
+	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
+	 */
+	bic_set_bfqq(bic, new_bfqq, 1);
+	bfq_mark_bfqq_coop(new_bfqq);
+	/*
+	 * new_bfqq now belongs to at least two bics (it is a shared queue):
+	 * set new_bfqq->bic to NULL. bfqq either:
+	 * - does not belong to any bic any more, and hence bfqq->bic must
+	 *   be set to NULL, or
+	 * - is a queue whose owning bics have already been redirected to a
+	 *   different queue, hence the queue is destined to not belong to
+	 *   any bic soon and bfqq->bic is already NULL (therefore the next
+	 *   assignment causes no harm).
+	 */
+	new_bfqq->bic = NULL;
+	bfqq->bic = NULL;
+	bfq_put_queue(bfqq);
+}
+
+static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
+{
+	struct bfq_io_cq *bic = bfqq->bic;
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
+		bic->failed_cooperations++;
+		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
+			bic->cooperations = 0;
+	}
+}
+
+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
+			   struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq, *new_bfqq;
+
+	/*
+	 * Disallow merge of a sync bio into an async request.
+	 */
+	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
+		return 0;
+
+	/*
+	 * Lookup the bfqq that this bio will be queued with. Allow
+	 * merge only if rq is queued there.
+	 * Queue lock is held here.
+	 */
+	bic = bfq_bic_lookup(bfqd, current->io_context);
+	if (!bic)
+		return 0;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	/*
+	 * We take advantage of this function to perform an early merge
+	 * of the queues of possible cooperating processes.
+	 */
+	if (bfqq) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+		if (new_bfqq) {
+			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+			/*
+			 * If we get here, the bio will be queued in the
+			 * shared queue, i.e., new_bfqq, so use new_bfqq
+			 * to decide whether bio and rq can be merged.
+			 */
+			bfqq = new_bfqq;
+		} else
+			bfq_bfqq_increase_failed_cooperations(bfqq);
+	}
+
+	return bfqq == RQ_BFQQ(rq);
+}
+
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+				       struct bfq_queue *bfqq)
+{
+	if (bfqq) {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+#endif
+		bfq_mark_bfqq_must_alloc(bfqq);
+		bfq_mark_bfqq_budget_new(bfqq);
+		bfq_clear_bfqq_fifo_expire(bfqq);
+
+		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+
+		bfq_log_bfqq(bfqd, bfqq,
+			     "set_in_service_queue, cur-budget = %d",
+			     bfqq->entity.budget);
+	}
+
+	bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Get and set a new queue for service.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
+
+	__bfq_set_in_service_queue(bfqd, bfqq);
+	return bfqq;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget;
+	else
+		return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget / 32;
+	else
+		return bfqd->bfq_max_budget / 32;
+}
+
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	struct bfq_io_cq *bic;
+	unsigned long sl;
+
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Processes have exited, don't wait. */
+	bic = bfqd->in_service_bic;
+	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
+		return;
+
+	bfq_mark_bfqq_wait_request(bfqq);
+
+	/*
+	 * We don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. So allow a little bit of time for it to submit a new rq.
+	 *
+	 * To prevent processes with (partly) seeky workloads from
+	 * being too ill-treated, grant them a small fraction of the
+	 * assigned budget before reducing the waiting time to
+	 * BFQ_MIN_TT. This happened to help reduce latency.
+	 */
+	sl = bfqd->bfq_slice_idle;
+	/*
+	 * Unless the queue is being weight-raised or the scenario is
+	 * asymmetric, grant only minimum idle time if the queue either
+	 * has been seeky for long enough or has already proved to be
+	 * constantly seeky.
+	 */
+	if (bfq_sample_valid(bfqq->seek_samples) &&
+	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
+				  bfq_max_budget(bfqq->bfqd) / 8) ||
+	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
+	    bfq_symmetric_scenario(bfqd))
+		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
+	else if (bfqq->wr_coeff > 1)
+		sl = sl * 3;
+	bfqd->last_idling_start = ktime_get();
+	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+#endif
+	bfq_log(bfqd, "arm idle: %u/%u ms",
+		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
+}
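+
+/*
+ * Idling-time sketch for bfq_arm_slice_timer(), with purely
+ * illustrative values: assuming bfq_slice_idle = 8 ms and BFQ_MIN_TT =
+ * 2 ms, a weight-raised queue is granted an idle window of 3 * 8 =
+ * 24 ms, a non-weight-raised queue detected as seeky in a symmetric
+ * scenario is granted only min(8, 2) = 2 ms, and any other queue gets
+ * the plain 8 ms.
+ */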
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the disk
+ * throughput (always guaranteed with a time slice scheme as in CFQ).
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	unsigned int timeout_coeff;
+
+	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = 1;
+	else
+		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+	bfqd->last_budget_start = ktime_get();
+
+	bfq_clear_bfqq_budget_new(bfqq);
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+
+	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
+		timeout_coeff));
+}
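+
+/*
+ * Example of the timeout scaling in bfq_set_budget_timeout(), with
+ * purely illustrative values: with bfq_timeout[sync] = 125 ms, a
+ * non-raised queue (weight == orig_weight) gets a 125 ms budget
+ * timeout, while an interactively weight-raised queue whose weight is,
+ * say, 10 times its original weight gets 1250 ms. A queue raised as
+ * soft real-time (wr_cur_max_time == bfq_wr_rt_max_time) keeps the
+ * plain 125 ms timeout.
+ */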
+
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	/*
+	 * For consistency, the next instruction should have been executed
+	 * after removing the request from the queue and dispatching it.
+	 * We execute this instruction before bfq_remove_request() instead
+	 * (and hence introduce a temporary inconsistency), for efficiency.
+	 * In fact, during a forced dispatch, this prevents two counters
+	 * related to bfqq->dispatched from being uselessly decremented if
+	 * bfqq is not in service, and then incremented again after
+	 * incrementing bfqq->dispatched.
+	 */
+	bfqq->dispatched++;
+	bfq_remove_request(rq);
+	elv_dispatch_sort(q, rq);
+
+	if (bfq_bfqq_sync(bfqq))
+		bfqd->sync_flight++;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),
+				   rq->cmd_flags);
+#endif
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+{
+	struct request *rq = NULL;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	if (list_empty(&bfqq->fifo))
+		return NULL;
+
+	rq = rq_entry_fifo(bfqq->fifo.next);
+
+	if (time_before(jiffies, rq->fifo_time))
+		return NULL;
+
+	return rq;
+}
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	return entity->budget - entity->service;
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	__bfq_bfqd_reset_in_service(bfqd);
+
+	/*
+	 * If this bfqq is shared between multiple processes, check
+	 * to make sure that those processes are still issuing I/Os
+	 * within the mean seek distance. If not, it may be time to
+	 * break the queues apart again.
+	 */
+	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+		bfq_mark_bfqq_split_coop(bfqq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * Overloading budget_timeout field to store the time
+		 * at which the queue remains with no backlog; used by
+		 * the weight-raising mechanism.
+		 */
+		bfqq->budget_timeout = jiffies;
+		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+	} else {
+		bfq_activate_bfqq(bfqd, bfqq);
+		/*
+		 * Resort priority tree of potential close cooperators.
+		 */
+		bfq_pos_tree_add_move(bfqd, bfqq);
+	}
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+				     struct bfq_queue *bfqq,
+				     enum bfqq_expiration reason)
+{
+	struct request *next_rq;
+	int budget, min_budget;
+
+	budget = bfqq->max_budget;
+	min_budget = bfq_min_budget(bfqd);
+
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+		budget, bfq_min_budget(bfqd));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+	if (bfq_bfqq_sync(bfqq)) {
+		switch (reason) {
+		/*
+		 * Caveat: in all the following cases we trade latency
+		 * for throughput.
+		 */
+		case BFQ_BFQQ_TOO_IDLE:
+			/*
+			 * This is the only case where we may reduce
+			 * the budget: if there is no request of the
+			 * process still waiting for completion, then
+			 * we assume (tentatively) that the timer has
+			 * expired because the batch of requests of
+			 * the process could have been served with a
+			 * smaller budget.  Hence, betting that the
+			 * process will behave in the same way when it
+			 * becomes backlogged again, we reduce its
+			 * next budget.  As long as we guess right,
+			 * this budget cut reduces the latency
+			 * experienced by the process.
+			 *
+			 * However, if there are still outstanding
+			 * requests, then the process may have not yet
+			 * issued its next request just because it is
+			 * still waiting for the completion of some of
+			 * the still outstanding ones.  So in this
+			 * subcase we do not reduce its budget, on the
+			 * contrary we increase it to possibly boost
+			 * the throughput, as discussed in the
+			 * comments to the BUDGET_TIMEOUT case.
+			 */
+			if (bfqq->dispatched > 0) /* still outstanding reqs */
+				budget = min(budget * 2, bfqd->bfq_max_budget);
+			else {
+				if (budget > 5 * min_budget)
+					budget -= 4 * min_budget;
+				else
+					budget = min_budget;
+			}
+			break;
+		case BFQ_BFQQ_BUDGET_TIMEOUT:
+			/*
+			 * We double the budget here because: 1) it
+			 * gives the chance to boost the throughput if
+			 * this is not a seeky process (which may have
+			 * bumped into this timeout because of, e.g.,
+			 * ZBR), 2) together with charge_full_budget
+			 * it helps give seeky processes higher
+			 * timestamps, and hence be served less
+			 * frequently.
+			 */
+			budget = min(budget * 2, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_BUDGET_EXHAUSTED:
+			/*
+			 * The process still has backlog, and did not
+			 * let either the budget timeout or the disk
+			 * idling timeout expire. Hence it is not
+			 * seeky, has a short thinktime and may be
+			 * happy with a higher budget too. So
+			 * definitely increase the budget of this good
+			 * candidate to boost the disk throughput.
+			 */
+			budget = min(budget * 4, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_NO_MORE_REQUESTS:
+		       /*
+			* Leave the budget unchanged.
+			*/
+		default:
+			return;
+		}
+	} else
+		/*
+		 * Async queues get always the maximum possible budget
+		 * (their ability to dispatch is limited by
+		 * @bfqd->bfq_max_budget_async_rq).
+		 */
+		budget = bfqd->bfq_max_budget;
+
+	bfqq->max_budget = budget;
+
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+	    !bfqd->bfq_user_max_budget)
+		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+	/*
+	 * Make sure that we have enough budget for the next request.
+	 * Since the finish time of the bfqq must be kept in sync with
+	 * the budget, be sure to call __bfq_bfqq_expire() after the
+	 * update.
+	 */
+	next_rq = bfqq->next_rq;
+	if (next_rq)
+		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+					    bfq_serv_to_charge(next_rq, bfqq));
+	else
+		bfqq->entity.budget = bfqq->max_budget;
+
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+			next_rq ? blk_rq_sectors(next_rq) : 0,
+			bfqq->entity.budget);
+}
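+
+/*
+ * Illustrative trajectory of the budget feedback above, assuming a
+ * hypothetical min_budget of 1000 sectors, a max_budget of 32000
+ * sectors and an initial max_budget of 8000 sectors for the queue:
+ *
+ *   BUDGET_EXHAUSTED: 8000 * 4 = 32000, capped at max_budget
+ *   BUDGET_TIMEOUT:   32000 * 2, again capped at 32000
+ *   TOO_IDLE with no outstanding requests and budget > 5 * min_budget:
+ *                     32000 - 4 * 1000 = 28000
+ *   further TOO_IDLE expirations keep subtracting 4 * min_budget,
+ *   until the budget bottoms out at min_budget
+ */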
+
+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
+{
+	unsigned long max_budget;
+
+	/*
+	 * The max_budget calculated when autotuning is equal to the
+	 * number of sectors transferred in timeout_sync at the
+	 * estimated peak rate.
+	 */
+	max_budget = (unsigned long)(peak_rate * 1000 *
+				     timeout >> BFQ_RATE_SHIFT);
+
+	return max_budget;
+}
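+
+/*
+ * Unit check and worked example for bfq_calc_max_budget(), with purely
+ * illustrative numbers: peak_rate is stored in (sectors/usec) <<
+ * BFQ_RATE_SHIFT and timeout is in ms, so peak_rate * 1000 * timeout is
+ * in sectors << BFQ_RATE_SHIFT and the final shift yields plain
+ * sectors. For instance, a device sustaining about 100 MB/s transfers
+ * roughly 0.2 sectors/usec (512-byte sectors); with a 125 ms timeout
+ * the autotuned max_budget would then be about 0.2 * 1000 * 125 =
+ * 25000 sectors, i.e., roughly 12 MB.
+ */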
+
+/*
+ * In addition to updating the peak rate, checks whether the process
+ * is "slow", and returns true if so. This slow flag is used, in addition
+ * to the budget timeout, to reduce the amount of service provided to
+ * seeky processes, and hence reduce their chances to lower the
+ * throughput. See the code for more details.
+ */
+static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 bool compensate, enum bfqq_expiration reason)
+{
+	u64 bw, usecs, expected, timeout;
+	ktime_t delta;
+	int update = 0;
+
+	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+		return false;
+
+	if (compensate)
+		delta = bfqd->last_idling_start;
+	else
+		delta = ktime_get();
+	delta = ktime_sub(delta, bfqd->last_budget_start);
+	usecs = ktime_to_us(delta);
+
+	/* Don't trust short/unrealistic values. */
+	if (usecs < 100 || usecs >= LONG_MAX)
+		return false;
+
+	/*
+	 * Calculate the bandwidth for the last slice.  We use a 64 bit
+	 * value to store the peak rate, in sectors per usec in fixed
+	 * point math.  We do so to have enough precision in the estimate
+	 * and to avoid overflows.
+	 */
+	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
+	do_div(bw, (unsigned long)usecs);
+
+	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+	/*
+	 * Use only long (> 20ms) intervals to filter out spikes for
+	 * the peak rate estimation.
+	 */
+	if (usecs > 20000) {
+		if (bw > bfqd->peak_rate ||
+		   (!BFQQ_SEEKY(bfqq) &&
+		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
+			bfq_log(bfqd, "measured bw =%llu", bw);
+			/*
+			 * To smooth oscillations use a low-pass filter with
+			 * alpha=7/8, i.e.,
+			 * new_rate = (7/8) * old_rate + (1/8) * bw
+			 */
+			do_div(bw, 8);
+			if (bw == 0)
+				return 0;
+			bfqd->peak_rate *= 7;
+			do_div(bfqd->peak_rate, 8);
+			bfqd->peak_rate += bw;
+			update = 1;
+			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
+		}
+
+		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
+
+		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
+			bfqd->peak_rate_samples++;
+
+		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
+		    update) {
+			int dev_type = blk_queue_nonrot(bfqd->queue);
+			if (bfqd->bfq_user_max_budget == 0) {
+				bfqd->bfq_max_budget =
+					bfq_calc_max_budget(bfqd->peak_rate,
+							    timeout);
+				bfq_log(bfqd, "new max_budget=%d",
+					bfqd->bfq_max_budget);
+			}
+			if (bfqd->device_speed == BFQ_BFQD_FAST &&
+			    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_SLOW;
+				bfqd->RT_prod = R_slow[dev_type] *
+						T_slow[dev_type];
+			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+			    bfqd->peak_rate > device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_FAST;
+				bfqd->RT_prod = R_fast[dev_type] *
+						T_fast[dev_type];
+			}
+		}
+	}
+
+	/*
+	 * If the process has been served for too short a time
+	 * interval to let its possible sequential accesses prevail over
+	 * the initial seek time needed to move the disk head to the
+	 * first sector it requested, then give the process a chance
+	 * and for the moment return false.
+	 */
+	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
+		return false;
+
+	/*
+	 * A process is considered ``slow'' (i.e., seeky, so that we
+	 * cannot treat it fairly in the service domain, as it would
+	 * slow down too much the other processes) if, when a slice
+	 * ends for whatever reason, it has received service at a
+	 * rate that would not be high enough to complete the budget
+	 * before the budget timeout expiration.
+	 */
+	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+
+	/*
+	 * Caveat: processes doing IO in the slower disk zones will
+	 * tend to be slow(er) even if not seeky. And the estimated
+	 * peak rate will actually be an average over the disk
+	 * surface. Hence, to not be too harsh with unlucky processes,
+	 * we keep a budget/3 margin of safety before declaring a
+	 * process slow.
+	 */
+	return expected > (4 * bfqq->entity.budget) / 3;
+}
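+
+/*
+ * Numerical sketch of the low-pass filter above, with hypothetical
+ * figures: if the stored peak rate corresponds to 160 MB/s and the
+ * last slice was measured at 200 MB/s, the new estimate becomes
+ * (7/8) * 160 + (1/8) * 200 = 165 MB/s. A single fast (or slow) slice
+ * thus moves the estimate only slightly, which smooths out transient
+ * oscillations in the measured rate.
+ */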
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to play back or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application from
+ * being deemed soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter instead issue their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ *    HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ *    for a while, then suddenly 'jump' by several units to recover the lost
+ *    increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq)
+{
+	return max(bfqq->last_idle_bklogged +
+		   HZ * bfqq->service_from_backlogged /
+		   bfqd->bfq_wr_max_softrt_rate,
+		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);
+}
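+
+/*
+ * Worked example for the formula above, with a hypothetical
+ * bfq_wr_max_softrt_rate of 7000 sectors/sec: if the queue has received
+ * 3500 sectors of service since it last became backlogged after an idle
+ * period, the first term evaluates to last_idle_bklogged + HZ/2, i.e.,
+ * half a second after the start of the current backlog. The next
+ * request must arrive no earlier than that instant (and no earlier than
+ * jiffies + bfq_slice_idle + 4) for the queue to keep matching the
+ * soft real-time bandwidth requirement.
+ */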
+
+/*
+ * Return the largest-possible time instant such that, for as long as possible,
+ * the current time will be lower than this time instant according to the macro
+ * time_is_before_jiffies().
+ */
+static unsigned long bfq_infinity_from_now(unsigned long now)
+{
+	return now + ULONG_MAX / 2;
+}
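+
+/*
+ * Note on the arithmetic above: the jiffies comparison macros work on
+ * signed wrap-around differences, so an instant can be recognized as
+ * lying in the future only while it is less than ULONG_MAX/2 ticks
+ * ahead of jiffies. Returning now + ULONG_MAX/2 therefore yields the
+ * instant that keeps time_is_before_jiffies() false for as long as
+ * possible, about ULONG_MAX/2 jiffies.
+ */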
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated to the queue is slow (i.e., seeky), or in
+ * case of budget timeout, or, finally, if it is async, we
+ * artificially charge it an entire budget (independently of the
+ * actual service it received). As a consequence, the queue will get
+ * higher timestamps than the correct ones upon reactivation, and
+ * hence it will be rescheduled as if it had received more service
+ * than what it actually received. In the end, this class of processes
+ * will receive less service in proportion to how slowly they consume
+ * their budgets (and hence how seriously they tend to lower the
+ * throughput).
+ *
+ * In contrast, when a queue expires because it has been idling for
+ * too long or because it exhausted its budget, we do not touch the
+ * amount of service it has received. Hence when the queue will be
+ * reactivated and its timestamps updated, the latter will be in sync
+ * with the actual service received by the queue until expiration.
+ *
+ * Charging a full budget to the first type of queues and the exact
+ * service to the others has the effect of using the WF2Q+ policy to
+ * schedule the former on a timeslice basis, without violating the
+ * service domain guarantees of the latter.
+ */
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason)
+{
+	bool slow;
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	/*
+	 * Update disk peak rate for autotuning and check whether the
+	 * process is slow (see bfq_update_peak_rate).
+	 */
+	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
+
+	/*
+	 * As explained above, 'punish' slow (i.e., seeky), timed-out
+	 * and async queues, to favor sequential sync workloads.
+	 *
+	 * Processes doing I/O in the slower disk zones will tend to be
+	 * slow(er) even if not seeky. Hence, since the estimated peak
+	 * rate is actually an average over the disk surface, these
+	 * processes may timeout just for bad luck. To avoid punishing
+	 * them we do not charge a full budget to a process that
+	 * succeeded in consuming at least 2/3 of its budget.
+	 */
+	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))
+		bfq_bfqq_charge_full_budget(bfqq);
+
+	bfqq->service_from_backlogged += bfqq->entity.service;
+
+	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+	    !bfq_bfqq_constantly_seeky(bfqq)) {
+		bfq_mark_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue))
+			bfqd->const_seeky_busy_in_flight_queues++;
+	}
+
+	if (reason == BFQ_BFQQ_TOO_IDLE &&
+	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
+		bfq_clear_bfqq_IO_bound(bfqq);
+
+	if (bfqd->low_latency && bfqq->wr_coeff == 1)
+		bfqq->last_wr_start_finish = jiffies;
+
+	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * If we get here, and there are no outstanding requests,
+		 * then the request pattern is isochronous (see the comments
+		 * to the function bfq_bfqq_softrt_next_start()). Hence we
+		 * can compute soft_rt_next_start. If, instead, the queue
+		 * still has outstanding requests, then we have to wait
+		 * for the completion of all the outstanding requests to
+		 * discover whether the request pattern is actually
+		 * isochronous.
+		 */
+		if (bfqq->dispatched == 0)
+			bfqq->soft_rt_next_start =
+				bfq_bfqq_softrt_next_start(bfqd, bfqq);
+		else {
+			/*
+			 * The application is still waiting for the
+			 * completion of one or more requests:
+			 * prevent it from possibly being incorrectly
+			 * deemed as soft real-time by setting its
+			 * soft_rt_next_start to infinity. In fact,
+			 * without this assignment, the application
+			 * would be incorrectly deemed as soft
+			 * real-time if:
+			 * 1) it issued a new request before the
+			 *    completion of all its in-flight
+			 *    requests, and
+			 * 2) at that time, its soft_rt_next_start
+			 *    happened to be in the past.
+			 */
+			bfqq->soft_rt_next_start =
+				bfq_infinity_from_now(jiffies);
+			/*
+			 * Schedule an update of soft_rt_next_start to when
+			 * the task may be discovered to be isochronous.
+			 */
+			bfq_mark_bfqq_softrt_update(bfqq);
+		}
+	}
+
+	bfq_log_bfqq(bfqd, bfqq,
+		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
+		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+
+	/*
+	 * Increase, decrease or leave budget unchanged according to
+	 * reason.
+	 */
+	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+	__bfq_bfqq_expire(bfqd, bfqq);
+}
+
+/*
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_budget_new(bfqq) ||
+	    time_before(jiffies, bfqq->budget_timeout))
+		return false;
+	return true;
+}
+
+/*
+ * If we expire a queue that is waiting for the arrival of a new
+ * request, we may prevent the fictitious timestamp back-shifting that
+ * allows the guarantees of the queue to be preserved (see [1] for
+ * this tricky aspect). Hence we return true only if this condition
+ * does not hold, or if the queue is slow enough to deserve only to be
+ * kicked off for preserving a high throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		"may_budget_timeout: wait_request %d left %d timeout %d",
+		bfq_bfqq_wait_request(bfqq),
+			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,
+		bfq_bfqq_budget_timeout(bfqq));
+
+	return (!bfq_bfqq_wait_request(bfqq) ||
+		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
+		&&
+		bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is nevertheless necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * while introducing the variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		all_queues_seeky, on_hdd_and_not_all_queues_seeky,
+		idling_needed_for_service_guarantees,
+		asymmetric_scenario;
+
+	/*
+	 * The next variable takes into account the cases where idling
+	 * boosts the throughput.
+	 *
+	 * The value of the variable is computed considering, first, that
+	 * idling is virtually always beneficial for the throughput if:
+	 * (a) the device is not NCQ-capable, or
+	 * (b) regardless of the presence of NCQ, the device is rotational
+	 *     and the request pattern for bfqq is I/O-bound and sequential.
+	 *
+	 * Secondly, and in contrast to the above item (b), idling an
+	 * NCQ-capable flash-based device would not boost the
+	 * throughput even with sequential I/O; rather it would lower
+	 * the throughput in proportion to how fast the device
+	 * is. Accordingly, the next variable is true if any of the
+	 * above conditions (a) and (b) is true, and, in particular,
+	 * happens to be false if bfqd is an NCQ-capable flash-based
+	 * device.
+	 */
+	idling_boosts_thr = !bfqd->hw_tag ||
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+		 bfq_bfqq_idle_window(bfqq));
+
+	/*
+	 * The value of the next variable,
+	 * idling_boosts_thr_without_issues, is equal to that of
+	 * idling_boosts_thr, unless a special case holds. In this
+	 * special case, described below, idling may cause problems to
+	 * weight-raised queues.
+	 *
+	 * When the request pool is saturated (e.g., in the presence
+	 * of write hogs), if the processes associated with
+	 * non-weight-raised queues ask for requests at a lower rate,
+	 * then processes associated with weight-raised queues have a
+	 * higher probability to get a request from the pool
+	 * immediately (or at least soon) when they need one. Thus
+	 * they have a higher probability to actually get a fraction
+	 * of the device throughput proportional to their high
+	 * weight. This is especially true with NCQ-capable drives,
+	 * which enqueue several requests in advance, and further
+	 * reorder internally-queued requests.
+	 *
+	 * For this reason, we force to false the value of
+	 * idling_boosts_thr_without_issues if there are weight-raised
+	 * busy queues. In this case, and if bfqq is not weight-raised,
+	 * this guarantees that the device is not idled for bfqq (if,
+	 * instead, bfqq is weight-raised, then idling will be
+	 * guaranteed by another variable, see below). Combined with
+	 * the timestamping rules of BFQ (see [1] for details), this
+	 * behavior causes bfqq, and hence any sync non-weight-raised
+	 * queue, to get a lower number of requests served, and thus
+	 * to ask for a lower number of requests from the request
+	 * pool, before the busy weight-raised queues get served
+	 * again. This often mitigates starvation problems in the
+	 * presence of heavy write workloads and NCQ, thereby
+	 * guaranteeing a higher application and system responsiveness
+	 * in these hostile scenarios.
+	 */
+	idling_boosts_thr_without_issues = idling_boosts_thr &&
+		bfqd->wr_busy_queues == 0;
+
+	/*
+	 * There are then two cases where idling must be performed not
+	 * for throughput concerns, but to preserve service
+	 * guarantees. In the description of these cases, we say, for
+	 * short, that a queue is sequential/random if the process
+	 * associated to the queue issues sequential/random requests
+	 * (in the second case the queue may be tagged as seeky or
+	 * even constantly_seeky).
+	 *
+	 * To introduce the first case, we note that, since
+	 * bfq_bfqq_idle_window(bfqq) is false if the device is
+	 * NCQ-capable and bfqq is random (see
+	 * bfq_update_idle_window()), then, from the above two
+	 * assignments it follows that
+	 * idling_boosts_thr_without_issues is false if the device is
+	 * NCQ-capable and bfqq is random. Therefore, for this case,
+	 * device idling would never be allowed if we used just
+	 * idling_boosts_thr_without_issues to decide whether to allow
+	 * it. And, beneficially, this would imply that throughput
+	 * would always be boosted also with random I/O on NCQ-capable
+	 * HDDs.
+	 *
+	 * But we must be careful on this point, to avoid an unfair
+	 * treatment for bfqq. In fact, because of the same above
+	 * assignments, idling_boosts_thr_without_issues is, on the
+	 * other hand, true if 1) the device is an HDD and bfqq is
+	 * sequential, and 2) there are no busy weight-raised
+	 * queues. As a consequence, if we used just
+	 * idling_boosts_thr_without_issues to decide whether to idle
+	 * the device, then with an HDD we might easily bump into a
+	 * scenario where queues that are sequential and I/O-bound
+	 * would enjoy idling, whereas random queues would not. The
+	 * latter might then get a low share of the device throughput,
+	 * simply because the former would get many requests served
+	 * after being set as in service, while the latter would not.
+	 *
+	 * To address this issue, we start by setting to true a
+	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
+	 * device is rotational and not all queues with pending or
+	 * in-flight requests are constantly seeky (i.e., there are
+	 * active sequential queues, and bfqq might then be mistreated
+	 * if it does not enjoy idling because it is random).
+	 */
+	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
+			   bfqd->busy_in_flight_queues ==
+			   bfqd->const_seeky_busy_in_flight_queues;
+
+	on_hdd_and_not_all_queues_seeky =
+		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
+
+	/*
+	 * To introduce the second case where idling needs to be
+	 * performed to preserve service guarantees, we can note that
+	 * allowing the drive to enqueue more than one request at a
+	 * time, and hence delegating de facto final scheduling
+	 * decisions to the drive's internal scheduler, causes loss of
+	 * control on the actual request service order. In particular,
+	 * the critical situation is when requests from different
+	 * processes happen to be present, at the same time, in the
+	 * internal queue(s) of the drive. In such a situation, the
+	 * drive, by deciding the service order of the
+	 * internally-queued requests, does determine also the actual
+	 * throughput distribution among these processes. But the
+	 * drive typically has no notion or concern about per-process
+	 * throughput distribution, and makes its decisions only on a
+	 * per-request basis. Therefore, the service distribution
+	 * enforced by the drive's internal scheduler is likely to
+	 * coincide with the desired device-throughput distribution
+	 * only in a completely symmetric scenario where:
+	 * (i)  each of these processes must get the same throughput as
+	 *      the others;
+	 * (ii) all these processes have the same I/O pattern
+	 *      (either sequential or random).
+	 * In fact, in such a scenario, the drive will tend to treat
+	 * the requests of each of these processes in about the same
+	 * way as the requests of the others, and thus to provide
+	 * each of these processes with about the same throughput
+	 * (which is exactly the desired throughput distribution). In
+	 * contrast, in any asymmetric scenario, device idling is
+	 * certainly needed to guarantee that bfqq receives its
+	 * assigned fraction of the device throughput (see [1] for
+	 * details).
+	 *
+	 * We address this issue by controlling, actually, only the
+	 * symmetry sub-condition (i), i.e., provided that
+	 * sub-condition (i) holds, idling is not performed,
+	 * regardless of whether sub-condition (ii) holds. In other
+	 * words, only if sub-condition (i) holds, then idling is
+	 * allowed, and the device tends to be prevented from queueing
+	 * many requests, possibly of several processes. The reason
+	 * for not controlling also sub-condition (ii) is that, first,
+	 * in the case of an HDD, the asymmetry in terms of types of
+	 * I/O patterns is already taken into account in the above
+	 * sentinel variable
+	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
+	 * flash-based device, we prefer however to privilege
+	 * throughput (and idling lowers throughput for this type of
+	 * device), for the following reasons:
+	 * 1) differently from HDDs, the service time of random
+	 *    requests is not orders of magnitude lower than the service
+	 *    time of sequential requests; thus, even if processes doing
+	 *    sequential I/O get a preferential treatment with respect to
+	 *    others doing random I/O, the consequences are not as
+	 *    dramatic as with HDDs;
+	 * 2) if a process doing random I/O does need strong
+	 *    throughput guarantees, it is hopefully already being
+	 *    weight-raised, or the user is likely to have assigned it a
+	 *    higher weight than the other processes (and thus
+	 *    sub-condition (i) is likely to be false, which triggers
+	 *    idling).
+	 *
+	 * According to the above considerations, the next variable is
+	 * true (only) if sub-condition (i) holds. To compute the
+	 * value of this variable, we not only use the return value of
+	 * the function bfq_symmetric_scenario(), but also check
+	 * whether bfqq is being weight-raised, because
+	 * bfq_symmetric_scenario() does not take into account also
+	 * weight-raised queues (see comments to
+	 * bfq_weights_tree_add()).
+	 *
+	 * As a side note, it is worth considering that the above
+	 * device-idling countermeasures may however fail in the
+	 * following unlucky scenario: if idling is (correctly)
+	 * disabled in a time period during which all symmetry
+	 * sub-conditions hold, and hence the device is allowed to
+	 * enqueue many requests, but at some later point in time some
+	 * sub-condition ceases to hold, then it may become impossible
+	 * to let requests be served in the desired order until all
+	 * the requests already queued in the device have been served.
+	 */
+	asymmetric_scenario = bfqq->wr_coeff > 1 ||
+		!bfq_symmetric_scenario(bfqd);
+
+	/*
+	 * Finally, there is a case where maximizing throughput is the
+	 * best choice even if it may cause unfairness toward
+	 * bfqq. Such a case is when bfqq became active in a burst of
+	 * queue activations. Queues that became active during a large
+	 * burst benefit only from throughput, as discussed in the
+	 * comments to bfq_handle_burst. Thus, if bfqq became active
+	 * in a burst and not idling the device maximizes throughput,
+	 * then the device must not be idled, because not idling the
+	 * device provides bfqq and all other queues in the burst with
+	 * maximum benefit. Combining this and the two cases above, we
+	 * can now establish when idling is actually needed to
+	 * preserve service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
+		!bfq_bfqq_in_large_burst(bfqq);
+
+	/*
+	 * We have now all the components we need to compute the return
+	 * value of the function, which is true only if both the following
+	 * conditions hold:
+	 * 1) bfqq is sync, because idling makes sense only for sync queues;
+	 * 2) idling either boosts the throughput (without issues), or
+	 *    is necessary to preserve service guarantees.
+	 */
+	return bfq_bfqq_sync(bfqq) &&
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
+}
+
+/*
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * returns true, then:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a new
+ *    request for the queue.
+ * See the comments to the function bfq_bfqq_may_idle for the reasons
+ * why performing device idling is the best choice to boost the throughput
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * returns true.
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+	       bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service.  If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+	struct request *next_rq;
+	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+
+	bfqq = bfqd->in_service_queue;
+	if (!bfqq)
+		goto new_queue;
+
+	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+	if (bfq_may_expire_for_budg_timeout(bfqq) &&
+	    !timer_pending(&bfqd->idle_slice_timer) &&
+	    !bfq_bfqq_must_idle(bfqq))
+		goto expire;
+
+	next_rq = bfqq->next_rq;
+	/*
+	 * If bfqq has requests queued and it has enough budget left to
+	 * serve them, keep the queue, otherwise expire it.
+	 */
+	if (next_rq) {
+		if (bfq_serv_to_charge(next_rq, bfqq) >
+			bfq_bfqq_budget_left(bfqq)) {
+			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
+			goto expire;
+		} else {
+			/*
+			 * The idle timer may be pending because we do
+			 * not always disable disk idling when a new
+			 * request arrives.
+			 */
+			if (timer_pending(&bfqd->idle_slice_timer)) {
+				/*
+				 * If we get here: 1) at least one new
+				 * request has arrived but we have not
+				 * disabled the timer because the request
+				 * was too small, and 2) the block layer
+				 * has then unplugged the device, causing
+				 * dispatch to be invoked.
+				 *
+				 * Since the device is unplugged, now the
+				 * requests are probably large enough to
+				 * provide a reasonable throughput.
+				 * So we disable idling.
+				 */
+				bfq_clear_bfqq_wait_request(bfqq);
+				del_timer(&bfqd->idle_slice_timer);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+				bfqg_stats_update_idle_time(bfqq_group(bfqq));
+#endif
+			}
+			goto keep_queue;
+		}
+	}
+
+	/*
+	 * No requests pending. However, if the in-service queue is idling
+	 * for a new request, or has requests waiting for a completion and
+	 * may idle after their completion, then keep it anyway.
+	 */
+	if (timer_pending(&bfqd->idle_slice_timer) ||
+	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+		bfqq = NULL;
+		goto keep_queue;
+	}
+
+	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+	bfqq = bfq_set_in_service_queue(bfqd);
+	bfq_log(bfqd, "select_queue: new queue %d returned",
+		bfqq ? bfqq->pid : 0);
+keep_queue:
+	return bfqq;
+}
+
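+/*
+ * Check whether the weight-raising period of bfqq should end, and keep
+ * the entity weight consistent with the current weight-raising state.
+ */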
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
+		       entity->orig_weight * bfqq->wr_coeff);
+		if (entity->prio_changed)
+			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+		/*
+		 * If the queue was activated in a burst, or
+		 * too much time has elapsed from the beginning
+		 * of this weight-raising period, or the queue has
+		 * exceeded the acceptable number of cooperations,
+		 * then end weight raising.
+		 */
+		if (bfq_bfqq_in_large_burst(bfqq) ||
+		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
+		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfqq->last_wr_start_finish = jiffies;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     bfqq->last_wr_start_finish,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+			bfq_bfqq_end_wr(bfqq);
+		}
+	}
+	/* Update weight both if it must be raised and if it must be lowered */
+	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+		__bfq_entity_update_weight_prio(
+			bfq_entity_service_tree(entity),
+			entity);
+}
+
+/*
+ * Dispatch one request from bfqq, moving it to the request queue
+ * dispatch list.
+ */
+static int bfq_dispatch_request(struct bfq_data *bfqd,
+				struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+	struct request *rq;
+	unsigned long service_to_charge;
+
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Follow expired path, else get first next available. */
+	rq = bfq_check_fifo(bfqq);
+	if (!rq)
+		rq = bfqq->next_rq;
+	service_to_charge = bfq_serv_to_charge(rq, bfqq);
+
+	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
+		/*
+		 * This may happen if the next rq is chosen in fifo order
+		 * instead of sector order. The budget is properly
+		 * dimensioned to be always sufficient to serve the next
+		 * request only if it is chosen in sector order. The reason
+		 * is that it would be quite inefficient, and of little use,
+		 * to always make sure that the budget is large enough to
+		 * serve even the possible next rq in fifo order.
+		 * In fact, requests are seldom served in fifo order.
+		 *
+		 * Expire the queue for budget exhaustion, and make sure
+		 * that the next act_budget is enough to serve the next
+		 * request, even if it comes from the fifo expired path.
+		 */
+		bfqq->next_rq = rq;
+		/*
+		 * Since this dispatch failed, make sure that a new
+		 * one will be performed.
+		 */
+		if (!bfqd->rq_in_driver)
+			bfq_schedule_dispatch(bfqd);
+		goto expire;
+	}
+
+	/* Finally, insert request into driver dispatch list. */
+	bfq_bfqq_served(bfqq, service_to_charge);
+	bfq_dispatch_insert(bfqd->queue, rq);
+
+	bfq_update_wr_data(bfqd, bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq,
+			"dispatched %u sec req (%llu), budg left %d",
+			blk_rq_sectors(rq),
+			(long long unsigned)blk_rq_pos(rq),
+			bfq_bfqq_budget_left(bfqq));
+
+	dispatched++;
+
+	if (!bfqd->in_service_bic) {
+		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
+		bfqd->in_service_bic = RQ_BIC(rq);
+	}
+
+	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
+	    dispatched >= bfqd->bfq_max_budget_async_rq) ||
+	    bfq_class_idle(bfqq)))
+		goto expire;
+
+	return dispatched;
+
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
+	return dispatched;
+}
+
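+/*
+ * Move all queued requests of bfqq straight to the dispatch list,
+ * without charging any budget; used by the forced-dispatch path below.
+ */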
+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+
+	while (bfqq->next_rq) {
+		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
+		dispatched++;
+	}
+
+	BUG_ON(!list_empty(&bfqq->fifo));
+	return dispatched;
+}
+
+/*
+ * Drain our current requests.
+ * Used for barriers and when switching io schedulers on-the-fly.
+ */
+static int bfq_forced_dispatch(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq, *n;
+	struct bfq_service_tree *st;
+	int dispatched = 0;
+
+	bfqq = bfqd->in_service_queue;
+	if (bfqq)
+		__bfq_bfqq_expire(bfqd, bfqq);
+
+	/*
+	 * Loop through classes, and be careful to leave the scheduler
+	 * in a consistent state, as feedback mechanisms and vtime
+	 * updates cannot be disabled during the process.
+	 */
+	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
+		st = bfq_entity_service_tree(&bfqq->entity);
+
+		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
+		bfqq->max_budget = bfq_max_budget(bfqd);
+
+		bfq_forget_idle(st);
+	}
+
+	BUG_ON(bfqd->busy_queues != 0);
+
+	return dispatched;
+}
+
+static int bfq_dispatch_requests(struct request_queue *q, int force)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq;
+	int max_dispatch;
+
+	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+	if (bfqd->busy_queues == 0)
+		return 0;
+
+	if (unlikely(force))
+		return bfq_forced_dispatch(bfqd);
+
+	bfqq = bfq_select_queue(bfqd);
+	if (!bfqq)
+		return 0;
+
+	if (bfq_class_idle(bfqq))
+		max_dispatch = 1;
+
+	if (!bfq_bfqq_sync(bfqq))
+		max_dispatch = bfqd->bfq_max_budget_async_rq;
+
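+	/*
+	 * Throttle async queues that already have at least max_dispatch
+	 * requests in flight: allow up to four times that amount only
+	 * if no other queue is busy.
+	 */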
+	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
+		if (bfqd->busy_queues > 1)
+			return 0;
+		if (bfqq->dispatched >= 4 * max_dispatch)
+			return 0;
+	}
+
+	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
+		return 0;
+
+	bfq_clear_bfqq_wait_request(bfqq);
+	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+
+	if (!bfq_dispatch_request(bfqd, bfqq))
+		return 0;
+
+	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
+			bfq_bfqq_sync(bfqq) ? "sync" : "async");
+
+	return 1;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits.  Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Queue lock must be held here.
+ */
+static void bfq_put_queue(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+	BUG_ON(atomic_read(&bfqq->ref) <= 0);
+
+	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
+		     atomic_read(&bfqq->ref));
+	if (!atomic_dec_and_test(&bfqq->ref))
+		return;
+
+	BUG_ON(rb_first(&bfqq->sort_list));
+	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
+	BUG_ON(bfqq->entity.tree);
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqd->in_service_queue == bfqq);
+
+	if (bfq_bfqq_sync(bfqq))
+		/*
+		 * The fact that this queue is being destroyed does not
+		 * invalidate the fact that this queue may have been
+		 * activated during the current burst. As a consequence,
+		 * although the queue no longer exists, and hence
+		 * needs to be removed from the burst list if present
+		 * there, the burst size must not be decremented.
+		 */
+		hlist_del_init(&bfqq->burst_list_node);
+
+	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
+
+	kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_put(bfqg);
+#endif
+}
+
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+	struct bfq_queue *__bfqq, *next;
+
+	/*
+	 * If this queue was scheduled to merge with another queue, be
+	 * sure to drop the reference taken on that queue (and others in
+	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
+	 */
+	__bfqq = bfqq->new_bfqq;
+	while (__bfqq) {
+		if (__bfqq == bfqq)
+			break;
+		next = __bfqq->new_bfqq;
+		bfq_put_queue(__bfqq);
+		__bfqq = next;
+	}
+}
+
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	if (bfqq == bfqd->in_service_queue) {
+		__bfq_bfqq_expire(bfqd, bfqq);
+		bfq_schedule_dispatch(bfqd);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+}
+
+static void bfq_init_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *bic = icq_to_bic(icq);
+
+	bic->ttime.last_end_request = jiffies;
+	/*
+	 * A newly created bic indicates that the process has just
+	 * started doing I/O, and is probably mapping into memory its
+	 * executable and libraries: it definitely needs weight raising.
+	 * There is however the possibility that the process performs,
+	 * for a while, I/O close to some other process. EQM intercepts
+	 * this behavior and may merge the queue corresponding to the
+	 * process with some other queue, BEFORE the weight of the queue
+	 * is raised. Merged queues are not weight-raised (they are assumed
+	 * to belong to processes that benefit only from high throughput).
+	 * If the merge is basically the consequence of an accident, then
+	 * the queue will be split soon and will get back its old weight.
+	 * It is then important to write down somewhere that this queue
+	 * does need weight raising, even if it did not make it to get its
+	 * weight raised before being merged. To this purpose, we overload
+	 * the field raising_time_left and assign 1 to it, to mark the queue
+	 * the field wr_time_left and assign 1 to it, to mark the queue
+	 */
+	bic->wr_time_left = 1;
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *bic = icq_to_bic(icq);
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+	if (bic->bfqq[BLK_RW_ASYNC]) {
+		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
+		bic->bfqq[BLK_RW_ASYNC] = NULL;
+	}
+
+	if (bic->bfqq[BLK_RW_SYNC]) {
+		/*
+		 * If the bic is using a shared queue, put the reference
+		 * taken on the io_context when the bic started using a
+		 * shared bfq_queue.
+		 */
+		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
+			put_io_context(icq->ioc);
+		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
+		bic->bfqq[BLK_RW_SYNC] = NULL;
+	}
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ */
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	struct task_struct *tsk = current;
+	int ioprio_class;
+
+	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+	switch (ioprio_class) {
+	default:
+		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
+			"bfq: bad prio class %d\n", ioprio_class);
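+		/* fall through: treat an unknown class as IOPRIO_CLASS_NONE */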
+	case IOPRIO_CLASS_NONE:
+		/*
+		 * No prio set, inherit CPU scheduling settings.
+		 */
+		bfqq->new_ioprio = task_nice_ioprio(tsk);
+		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+		break;
+	case IOPRIO_CLASS_RT:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+		break;
+	case IOPRIO_CLASS_BE:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+		break;
+	case IOPRIO_CLASS_IDLE:
+		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+		bfqq->new_ioprio = 7;
+		bfq_clear_bfqq_idle_window(bfqq);
+		break;
+	}
+
+	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {
+		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
+				 bfqq->new_ioprio);
+		BUG();
+	}
+
+	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+	bfqq->entity.prio_changed = 1;
+}
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd;
+	struct bfq_queue *bfqq, *new_bfqq;
+	unsigned long uninitialized_var(flags);
+	int ioprio = bic->icq.ioc->ioprio;
+
+	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
+				   &flags);
+	/*
+	 * This condition may trigger on a newly created bic, be sure to
+	 * drop the lock before returning.
+	 */
+	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
+		goto out;
+
+	bic->ioprio = ioprio;
+
+	bfqq = bic->bfqq[BLK_RW_ASYNC];
+	if (bfqq) {
+		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,
+					 GFP_ATOMIC);
+		if (new_bfqq) {
+			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "check_ioprio_change: bfqq %p %d",
+				     bfqq, atomic_read(&bfqq->ref));
+			bfq_put_queue(bfqq);
+		}
+	}
+
+	bfqq = bic->bfqq[BLK_RW_SYNC];
+	if (bfqq)
+		bfq_set_next_ioprio_data(bfqq, bic);
+
+out:
+	bfq_put_bfqd_unlock(bfqd, &flags);
+}
+
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+	RB_CLEAR_NODE(&bfqq->entity.rb_node);
+	INIT_LIST_HEAD(&bfqq->fifo);
+	INIT_HLIST_NODE(&bfqq->burst_list_node);
+
+	atomic_set(&bfqq->ref, 0);
+	bfqq->bfqd = bfqd;
+
+	if (bic)
+		bfq_set_next_ioprio_data(bfqq, bic);
+
+	if (is_sync) {
+		if (!bfq_class_idle(bfqq))
+			bfq_mark_bfqq_idle_window(bfqq);
+		bfq_mark_bfqq_sync(bfqq);
+	} else
+		bfq_clear_bfqq_sync(bfqq);
+	bfq_mark_bfqq_IO_bound(bfqq);
+
+	/* Tentative initial value to trade off between thr and lat */
+	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+	bfqq->pid = pid;
+
+	bfqq->wr_coeff = 1;
+	bfqq->last_wr_start_finish = 0;
+	/*
+	 * Set to a value such that bfqq will not be deemed a soft
+	 * real-time queue when it becomes backlogged.
+	 */
+	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
+}
+
+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
+					      struct bio *bio, int is_sync,
+					      struct bfq_io_cq *bic,
+					      gfp_t gfp_mask)
+{
+	struct bfq_group *bfqg;
+	struct bfq_queue *bfqq, *new_bfqq = NULL;
+	struct blkcg *blkcg;
+
+retry:
+	rcu_read_lock();
+
+	blkcg = bio_blkcg(bio);
+	bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	/* bic always exists here */
+	bfqq = bic_to_bfqq(bic, is_sync);
+
+	/*
+	 * Always try a new alloc if we fall back to the OOM bfqq
+	 * originally, since it should just be a temporary situation.
+	 */
+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+		bfqq = NULL;
+		if (new_bfqq) {
+			bfqq = new_bfqq;
+			new_bfqq = NULL;
+		} else if (gfpflags_allow_blocking(gfp_mask)) {
+			rcu_read_unlock();
+			spin_unlock_irq(bfqd->queue->queue_lock);
+			new_bfqq = kmem_cache_alloc_node(bfq_pool,
+					gfp_mask | __GFP_ZERO,
+					bfqd->queue->node);
+			spin_lock_irq(bfqd->queue->queue_lock);
+			if (new_bfqq)
+				goto retry;
+		} else {
+			bfqq = kmem_cache_alloc_node(bfq_pool,
+					gfp_mask | __GFP_ZERO,
+					bfqd->queue->node);
+		}
+
+		if (bfqq) {
+			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+                                      is_sync);
+			bfq_init_entity(&bfqq->entity, bfqg);
+			bfq_log_bfqq(bfqd, bfqq, "allocated");
+		} else {
+			bfqq = &bfqd->oom_bfqq;
+			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+		}
+	}
+
+	if (new_bfqq)
+		kmem_cache_free(bfq_pool, new_bfqq);
+
+	rcu_read_unlock();
+
+	return bfqq;
+}
+
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+					       struct bfq_group *bfqg,
+					       int ioprio_class, int ioprio)
+{
+	switch (ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		return &bfqg->async_bfqq[0][ioprio];
+	case IOPRIO_CLASS_NONE:
+		ioprio = IOPRIO_NORM;
+		/* fall through */
+	case IOPRIO_CLASS_BE:
+		return &bfqg->async_bfqq[1][ioprio];
+	case IOPRIO_CLASS_IDLE:
+		return &bfqg->async_idle_bfqq;
+	default:
+		BUG();
+	}
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bio *bio, int is_sync,
+				       struct bfq_io_cq *bic, gfp_t gfp_mask)
+{
+	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+	struct bfq_queue **async_bfqq = NULL;
+	struct bfq_queue *bfqq = NULL;
+
+	if (!is_sync) {
+		struct blkcg *blkcg;
+		struct bfq_group *bfqg;
+
+		rcu_read_lock();
+		blkcg = bio_blkcg(bio);
+		rcu_read_unlock();
+		bfqg = bfq_find_alloc_group(bfqd, blkcg);
+		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+						  ioprio);
+		bfqq = *async_bfqq;
+	}
+
+	if (!bfqq)
+		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);
+
+	/*
+	 * Pin the queue now that it's allocated; scheduler exit will
+	 * prune it.
+	 */
+	if (!is_sync && !(*async_bfqq)) {
+		atomic_inc(&bfqq->ref);
+		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		*async_bfqq = bfqq;
+	}
+
+	atomic_inc(&bfqq->ref);
+	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+	return bfqq;
+}
+
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+				    struct bfq_io_cq *bic)
+{
+	unsigned long elapsed = jiffies - bic->ttime.last_end_request;
+	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
+
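+	/*
+	 * Update the think-time statistics with an exponential decay:
+	 * keep 7/8 of the old value and add 1/8 of the new sample,
+	 * with samples counted on a fixed-point scale of 256.
+	 */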
+	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
+	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
+	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
+				bic->ttime.ttime_samples;
+}
+
+static void bfq_update_io_seektime(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct request *rq)
+{
+	sector_t sdist;
+	u64 total;
+
+	if (bfqq->last_request_pos < blk_rq_pos(rq))
+		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
+	else
+		sdist = bfqq->last_request_pos - blk_rq_pos(rq);
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc.
+	 */
+	if (bfqq->seek_samples == 0) /* first request, not really a seek */
+		sdist = 0;
+	else if (bfqq->seek_samples <= 60) /* second & third seek */
+		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
+
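+	/* Fold the capped distance in with the same 7/8 exponential decay. */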
+	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
+	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
+	total = bfqq->seek_total + (bfqq->seek_samples/2);
+	do_div(total, bfqq->seek_samples);
+	bfqq->seek_mean = (sector_t)total;
+
+	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
+			(u64)bfqq->seek_mean);
+}
+
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct bfq_io_cq *bic)
+{
+	int enable_idle;
+
+	/* Don't idle for async or idle io prio class. */
+	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+		return;
+
+	/* Idle window just restored; statistics are meaningless. */
+	if (bfq_bfqq_just_split(bfqq))
+		return;
+
+	enable_idle = bfq_bfqq_idle_window(bfqq);
+
+	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+	    bfqd->bfq_slice_idle == 0 ||
+		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+			bfqq->wr_coeff == 1))
+		enable_idle = 0;
+	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
+		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+			bfqq->wr_coeff == 1)
+			enable_idle = 0;
+		else
+			enable_idle = 1;
+	}
+	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+		enable_idle);
+
+	if (enable_idle)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq.  Check if there's
+ * something we should do about it.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			    struct request *rq)
+{
+	struct bfq_io_cq *bic = RQ_BIC(rq);
+
+	if (rq->cmd_flags & REQ_META)
+		bfqq->meta_pending++;
+
+	bfq_update_io_thinktime(bfqd, bic);
+	bfq_update_io_seektime(bfqd, bfqq, rq);
+	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
+		bfq_clear_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
+			bfqd->const_seeky_busy_in_flight_queues--;
+		}
+	}
+	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+	    !BFQQ_SEEKY(bfqq))
+		bfq_update_idle_window(bfqd, bfqq, bic);
+	bfq_clear_bfqq_just_split(bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
+		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
+		     (long long unsigned)bfqq->seek_mean);
+
+	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+				 blk_rq_sectors(rq) < 32;
+		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+		/*
+		 * There is just this request queued: if the request
+		 * is small and the queue is not to be expired, then
+		 * just exit.
+		 *
+		 * In this way, if the disk is being idled to wait for
+		 * a new request from the in-service queue, we avoid
+		 * unplugging the device and committing the disk to serve
+		 * just a small request. On the contrary, we wait for
+		 * the block layer to decide when to unplug the device:
+		 * hopefully, new requests will be merged to this one
+		 * quickly, then the device will be unplugged and
+		 * larger requests will be dispatched.
+		 */
+		if (small_req && !budget_timeout)
+			return;
+
+		/*
+		 * A large enough request arrived, or the queue is to
+		 * be expired: in both cases disk idling is to be
+		 * stopped, so clear wait_request flag and reset
+		 * timer.
+		 */
+		bfq_clear_bfqq_wait_request(bfqq);
+		del_timer(&bfqd->idle_slice_timer);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
+#endif
+
+		/*
+		 * The queue is not empty, because a new request just
+		 * arrived. Hence we can safely expire the queue, in
+		 * case of budget timeout, without risking that the
+		 * timestamps of the queue are not updated correctly.
+		 * See [1] for more details.
+		 */
+		if (budget_timeout)
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
+
+		/*
+		 * Let the request rip immediately, or let a new queue be
+		 * selected if bfqq has just been expired.
+		 */
+		__blk_run_queue(bfqd->queue);
+	}
+}
+
+static void bfq_insert_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/*
+	 * An unplug may trigger a requeue of a request from the device
+	 * driver: make sure we are in process context while trying to
+	 * merge two bfq_queues.
+	 */
+	if (!in_interrupt()) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+		if (new_bfqq) {
+			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+			/*
+			 * Release the request's reference to the old bfqq
+			 * and make sure one is taken to the shared queue.
+			 */
+			new_bfqq->allocated[rq_data_dir(rq)]++;
+			bfqq->allocated[rq_data_dir(rq)]--;
+			atomic_inc(&new_bfqq->ref);
+			bfq_put_queue(bfqq);
+			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+						bfqq, new_bfqq);
+			rq->elv.priv[1] = new_bfqq;
+			bfqq = new_bfqq;
+		} else
+			bfq_bfqq_increase_failed_cooperations(bfqq);
+	}
+
+	bfq_add_request(rq);
+
+	/*
+	 * Here a newly-created bfq_queue has already started a weight-raising
+	 * period: clear wr_time_left to prevent bfq_bfqq_save_state()
+	 * from assigning it a full weight-raising period. See the detailed
+	 * comments about this field in bfq_init_icq().
+	 */
+	if (bfqq->bic)
+		bfqq->bic->wr_time_left = 0;
+	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+	list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+	bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
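+/*
+ * Try to detect whether the drive keeps several requests in flight
+ * (i.e., supports internal queueing) by sampling the maximum number
+ * of requests in the driver.
+ */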
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
+				     bfqd->rq_in_driver);
+
+	if (bfqd->hw_tag == 1)
+		return;
+
+	/*
+	 * This sample is valid if the number of outstanding requests
+	 * is large enough to allow a queueing behavior.  Note that the
+	 * sum is not exact, as it does not take into account deactivated
+	 * requests.
+	 */
+	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+		return;
+
+	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+		return;
+
+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+	bfqd->max_rq_in_driver = 0;
+	bfqd->hw_tag_samples = 0;
+}
+
+static void bfq_completed_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	bool sync = bfq_bfqq_sync(bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
+		     blk_rq_sectors(rq), sync);
+
+	bfq_update_hw_tag(bfqd);
+
+	BUG_ON(!bfqd->rq_in_driver);
+	BUG_ON(!bfqq->dispatched);
+	bfqd->rq_in_driver--;
+	bfqq->dispatched--;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_completion(bfqq_group(bfqq),
+				     rq_start_time_ns(rq),
+				     rq_io_start_time_ns(rq), rq->cmd_flags);
+#endif
+
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->busy_in_flight_queues);
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq)) {
+				BUG_ON(!bfqd->
+					const_seeky_busy_in_flight_queues);
+				bfqd->const_seeky_busy_in_flight_queues--;
+			}
+		}
+	}
+
+	if (sync) {
+		bfqd->sync_flight--;
+		RQ_BIC(rq)->ttime.last_end_request = jiffies;
+	}
+
+	/*
+	 * If we are waiting to discover whether the request pattern of the
+	 * task associated with the queue is actually isochronous, and
+	 * both requisites for this condition to hold are satisfied, then
+	 * compute soft_rt_next_start (see the comments to the function
+	 * bfq_bfqq_softrt_next_start()).
+	 */
+	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list))
+		bfqq->soft_rt_next_start =
+			bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+	/*
+	 * If this is the in-service queue, check if it needs to be expired,
+	 * or if we want to idle in case it has no pending requests.
+	 */
+	if (bfqd->in_service_queue == bfqq) {
+		if (bfq_bfqq_budget_new(bfqq))
+			bfq_set_budget_timeout(bfqd);
+
+		if (bfq_bfqq_must_idle(bfqq)) {
+			bfq_arm_slice_timer(bfqd);
+			goto out;
+		} else if (bfq_may_expire_for_budg_timeout(bfqq))
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
+		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+			 (bfqq->dispatched == 0 ||
+			  !bfq_bfqq_may_idle(bfqq)))
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_NO_MORE_REQUESTS);
+	}
+
+	if (!bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+
+out:
+	return;
+}
+
+static int __bfq_may_queue(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
+		bfq_clear_bfqq_must_alloc(bfqq);
+		return ELV_MQUEUE_MUST;
+	}
+
+	return ELV_MQUEUE_MAY;
+}
+
+static int bfq_may_queue(struct request_queue *q, int rw)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	/*
+	 * Don't force setup of a queue from here, as a call to may_queue
+	 * does not necessarily imply that a request actually will be
+	 * queued. So just look up a possibly existing queue, or return
+	 * 'may queue' if that fails.
+	 */
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (!bic)
+		return ELV_MQUEUE_MAY;
+
+	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
+	if (bfqq)
+		return __bfq_may_queue(bfqq);
+
+	return ELV_MQUEUE_MAY;
+}
+
+/*
+ * Queue lock held here.
+ */
+static void bfq_put_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	if (bfqq) {
+		const int rw = rq_data_dir(rq);
+
+		BUG_ON(!bfqq->allocated[rw]);
+		bfqq->allocated[rw]--;
+
+		rq->elv.priv[0] = NULL;
+		rq->elv.priv[1] = NULL;
+
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+	}
+}
+
+/*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to said bfqq.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+	put_io_context(bic->icq.ioc);
+
+	if (bfqq_process_refs(bfqq) == 1) {
+		bfqq->pid = current->pid;
+		bfq_clear_bfqq_coop(bfqq);
+		bfq_clear_bfqq_split_coop(bfqq);
+		return bfqq;
+	}
+
+	bic_set_bfqq(bic, NULL, 1);
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+	return NULL;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ */
+static int bfq_set_request(struct request_queue *q, struct request *rq,
+			   struct bio *bio, gfp_t gfp_mask)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+	const int rw = rq_data_dir(rq);
+	const int is_sync = rq_is_sync(rq);
+	struct bfq_queue *bfqq;
+	unsigned long flags;
+	bool split = false;
+
+	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+	bfq_check_ioprio_change(bic, bio);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (!bic)
+		goto queue_fail;
+
+	bfq_bic_update_cgroup(bic, bio);
+
+new_queue:
+	bfqq = bic_to_bfqq(bic, is_sync);
+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);
+		bic_set_bfqq(bic, bfqq, is_sync);
+		if (split && is_sync) {
+			if ((bic->was_in_burst_list && bfqd->large_burst) ||
+			    bic->saved_in_large_burst)
+				bfq_mark_bfqq_in_large_burst(bfqq);
+			else {
+			    bfq_clear_bfqq_in_large_burst(bfqq);
+			    if (bic->was_in_burst_list)
+			       hlist_add_head(&bfqq->burst_list_node,
+				              &bfqd->burst_list);
+			}
+		}
+	} else {
+		/* If the queue was seeky for too long, break it apart. */
+		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+			bfqq = bfq_split_bfqq(bic, bfqq);
+			split = true;
+			if (!bfqq)
+				goto new_queue;
+		}
+	}
+
+	bfqq->allocated[rw]++;
+	atomic_inc(&bfqq->ref);
+	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+
+	rq->elv.priv[0] = bic;
+	rq->elv.priv[1] = bfqq;
+
+	/*
+	 * If a bfq_queue has only one process reference, it is owned
+	 * by only one bfq_io_cq: we can set the bic field of the
+	 * bfq_queue to the address of that structure. Also, if the
+	 * queue has just been split, mark a flag so that the
+	 * information is available to the other scheduler hooks.
+	 */
+	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+		bfqq->bic = bic;
+		if (split) {
+			bfq_mark_bfqq_just_split(bfqq);
+			/*
+			 * If the queue has just been split from a shared
+			 * queue, restore the idle window and the possible
+			 * weight raising period.
+			 */
+			bfq_bfqq_resume_state(bfqq, bic);
+		}
+	}
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+
+queue_fail:
+	bfq_schedule_dispatch(bfqd);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 1;
+}
+
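+/*
+ * Work handler for bfqd->unplug_work: re-run the request queue in
+ * process context so that a pending dispatch can make progress.
+ */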
+static void bfq_kick_queue(struct work_struct *work)
+{
+	struct bfq_data *bfqd =
+		container_of(work, struct bfq_data, unplug_work);
+	struct request_queue *q = bfqd->queue;
+
+	spin_lock_irq(q->queue_lock);
+	__blk_run_queue(q);
+	spin_unlock_irq(q->queue_lock);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice.
+ */
+static void bfq_idle_slice_timer(unsigned long data)
+{
+	struct bfq_data *bfqd = (struct bfq_data *)data;
+	struct bfq_queue *bfqq;
+	unsigned long flags;
+	enum bfqq_expiration reason;
+
+	spin_lock_irqsave(bfqd->queue->queue_lock, flags);
+
+	bfqq = bfqd->in_service_queue;
+	/*
+	 * Theoretical race here: the in-service queue can be NULL or
+	 * different from the queue that was idling if the timer handler
+	 * spins on the queue_lock and a new request arrives for the
+	 * current queue and there is a full dispatch cycle that changes
+	 * the in-service queue.  This can hardly happen, but in the worst
+	 * case we just expire a queue too early.
+	 */
+	if (bfqq) {
+		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
+		if (bfq_bfqq_budget_timeout(bfqq))
+			/*
+			 * Here, too, the queue can be safely expired
+			 * for budget timeout without losing service
+			 * guarantees.
+			 */
+			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+			/*
+			 * The queue may not be empty upon timer expiration,
+			 * because we may not disable the timer when the
+			 * first request of the in-service queue arrives
+			 * during disk idling.
+			 */
+			reason = BFQ_BFQQ_TOO_IDLE;
+		else
+			goto schedule_dispatch;
+
+		bfq_bfqq_expire(bfqd, bfqq, true, reason);
+	}
+
+schedule_dispatch:
+	bfq_schedule_dispatch(bfqd);
+
+	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
+}
+
+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
+{
+	del_timer_sync(&bfqd->idle_slice_timer);
+	cancel_work_sync(&bfqd->unplug_work);
+}
+
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+					struct bfq_queue **bfqq_ptr)
+{
+	struct bfq_group *root_group = bfqd->root_group;
+	struct bfq_queue *bfqq = *bfqq_ptr;
+
+	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+	if (bfqq) {
+		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
+		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+		*bfqq_ptr = NULL;
+	}
+}
+
+/*
+ * Release all the bfqg references to its async queues.  If we are
+ * deallocating the group, these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
+ */
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+
+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	struct request_queue *q = bfqd->queue;
+	struct bfq_queue *bfqq, *n;
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	spin_lock_irq(q->queue_lock);
+
+	BUG_ON(bfqd->in_service_queue);
+	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+		bfq_deactivate_bfqq(bfqd, bfqq, 0);
+
+	spin_unlock_irq(q->queue_lock);
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	synchronize_rcu();
+
+	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+#else
+	kfree(bfqd->root_group);
+#endif
+
+	kfree(bfqd);
+}
+
+static void bfq_init_root_group(struct bfq_group *root_group,
+				struct bfq_data *bfqd)
+{
+	int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	root_group->entity.parent = NULL;
+	root_group->my_entity = NULL;
+	root_group->bfqd = bfqd;
+#endif
+	root_group->rq_pos_tree = RB_ROOT;
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+}
+
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+	struct bfq_data *bfqd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+	if (!bfqd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	eq->elevator_data = bfqd;
+
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+	atomic_inc(&bfqd->oom_bfqq.ref);
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.prio_changed = 1;
+
+	bfqd->queue = q;
+
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
+
+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+	if (!bfqd->root_group)
+		goto out_free;
+	bfq_init_root_group(bfqd->root_group, bfqd);
+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqd->active_numerous_groups = 0;
+#endif
+
+	init_timer(&bfqd->idle_slice_timer);
+	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+	bfqd->idle_slice_timer.data = (unsigned long)bfqd;
+
+	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->group_weights_tree = RB_ROOT;
+
+	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
+
+	INIT_LIST_HEAD(&bfqd->active_list);
+	INIT_LIST_HEAD(&bfqd->idle_list);
+	INIT_HLIST_HEAD(&bfqd->burst_list);
+
+	bfqd->hw_tag = -1;
+
+	bfqd->bfq_max_budget = bfq_default_max_budget;
+
+	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+	bfqd->bfq_back_max = bfq_back_max;
+	bfqd->bfq_back_penalty = bfq_back_penalty;
+	bfqd->bfq_slice_idle = bfq_slice_idle;
+	bfqd->bfq_class_idle_last_service = 0;
+	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
+	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
+	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
+
+	bfqd->bfq_coop_thresh = 2;
+	bfqd->bfq_failed_cooperations = 7000;
+	bfqd->bfq_requests_within_timer = 120;
+
+	bfqd->bfq_large_burst_thresh = 11;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(500);
+
+	bfqd->low_latency = true;
+
+	bfqd->bfq_wr_coeff = 20;
+	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+	bfqd->bfq_wr_max_time = 0;
+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+	bfqd->bfq_wr_max_softrt_rate = 7000; /*
+					      * Approximate rate required
+					      * to playback or record a
+					      * high-definition compressed
+					      * video.
+					      */
+	bfqd->wr_busy_queues = 0;
+	bfqd->busy_in_flight_queues = 0;
+	bfqd->const_seeky_busy_in_flight_queues = 0;
+
+	/*
+	 * Begin by assuming, optimistically, that the device peak rate is
+	 * equal to the highest reference rate.
+	 */
+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+			T_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->device_speed = BFQ_BFQD_FAST;
+
+	return 0;
+
+out_free:
+	kfree(bfqd);
+	kobject_put(&eq->kobj);
+	return -ENOMEM;
+}
+
+static void bfq_slab_kill(void)
+{
+	if (bfq_pool)
+		kmem_cache_destroy(bfq_pool);
+}
+
+static int __init bfq_slab_setup(void)
+{
+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
+	if (!bfq_pool)
+		return -ENOMEM;
+	return 0;
+}
+
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
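+/*
+ * Note that, on a parse error, *var is left unchanged but the full
+ * count is still returned, so the write silently has no effect.
+ */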
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+			     size_t count)
+{
+	unsigned long new_val;
+	int ret = kstrtoul(page, 10, &new_val);
+
+	if (ret == 0)
+		*var = new_val;
+
+	return count;
+}
+
+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
+		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
+		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
+}
+
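+/*
+ * Debugging aid: dump the active and idle queues together with their
+ * weights and weight-raising state.
+ */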
+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_queue *bfqq;
+	struct bfq_data *bfqd = e->elevator_data;
+	ssize_t num_char = 0;
+
+	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
+			    bfqd->queued);
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	num_char += sprintf(page + num_char, "Active:\n");
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
+	  num_char += sprintf(page + num_char,
+			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
+			      bfqq->pid,
+			      bfqq->entity.weight,
+			      bfqq->queued[0],
+			      bfqq->queued[1],
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	num_char += sprintf(page + num_char, "Idle:\n");
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
+			num_char += sprintf(page + num_char,
+				"pid%d: weight %hu, dur %d/%u\n",
+				bfqq->pid,
+				bfqq->entity.weight,
+				jiffies_to_msecs(jiffies -
+					bfqq->last_wr_start_finish),
+				jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+
+	return num_char;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned int __data = __VAR;					\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return bfq_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_max_budget_async_rq_show,
+	      bfqd->bfq_max_budget_async_rq, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
+	1);
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t								\
+__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long uninitialized_var(__data);			\
+	int ret = bfq_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+		INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
+		1, INT_MAX, 0);
+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
+		1);
+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
+		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
+		INT_MAX, 0);
+#undef STORE_FUNCTION
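+
+/*
+ * Illustration only (not extra code): the STORE_FUNCTION() lines above
+ * generate the matching store handlers, which clamp the user-supplied
+ * value to [MIN, MAX] and convert it to jiffies when __CONV is 1.  For
+ * example, STORE_FUNCTION(bfq_back_seek_penalty_store,
+ * &bfqd->bfq_back_penalty, 1, INT_MAX, 0) expands, modulo trivial
+ * simplifications, to:
+ *
+ *	static ssize_t bfq_back_seek_penalty_store(struct elevator_queue *e,
+ *						   const char *page, size_t count)
+ *	{
+ *		struct bfq_data *bfqd = e->elevator_data;
+ *		unsigned long __data;
+ *		int ret = bfq_var_store(&__data, page, count);
+ *
+ *		if (__data < 1)
+ *			__data = 1;
+ *		else if (__data > INT_MAX)
+ *			__data = INT_MAX;
+ *		bfqd->bfq_back_penalty = __data;
+ *		return ret;
+ *	}
+ */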
+
+/* do nothing for the moment */
+static ssize_t bfq_weights_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	return count;
+}
+
+static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
+{
+	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
+		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
+	else
+		return bfq_default_max_budget;
+}
+
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data == 0)
+		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+	else {
+		if (__data > INT_MAX)
+			__data = INT_MAX;
+		bfqd->bfq_max_budget = __data;
+	}
+
+	bfqd->bfq_user_max_budget = __data;
+
+	return ret;
+}
+
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+				      const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data < 1)
+		__data = 1;
+	else if (__data > INT_MAX)
+		__data = INT_MAX;
+
+	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+
+	return ret;
+}
+
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+
+#define BFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+	BFQ_ATTR(fifo_expire_sync),
+	BFQ_ATTR(fifo_expire_async),
+	BFQ_ATTR(back_seek_max),
+	BFQ_ATTR(back_seek_penalty),
+	BFQ_ATTR(slice_idle),
+	BFQ_ATTR(max_budget),
+	BFQ_ATTR(max_budget_async_rq),
+	BFQ_ATTR(timeout_sync),
+	BFQ_ATTR(timeout_async),
+	BFQ_ATTR(low_latency),
+	BFQ_ATTR(wr_coeff),
+	BFQ_ATTR(wr_max_time),
+	BFQ_ATTR(wr_rt_max_time),
+	BFQ_ATTR(wr_min_idle_time),
+	BFQ_ATTR(wr_min_inter_arr_async),
+	BFQ_ATTR(wr_max_softrt_rate),
+	BFQ_ATTR(weights),
+	__ATTR_NULL
+};
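+
+/*
+ * For reference, each attribute above becomes a file under
+ * /sys/block/<dev>/queue/iosched/ when bfq is the active elevator for
+ * <dev>; an assumed interactive session (device name and output shown
+ * for illustration only) would look like:
+ *
+ *	# cat /sys/block/sda/queue/iosched/low_latency
+ *	1
+ *	# echo 0 > /sys/block/sda/queue/iosched/low_latency
+ */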
+
+static struct elevator_type iosched_bfq = {
+	.ops = {
+		.elevator_merge_fn =		bfq_merge,
+		.elevator_merged_fn =		bfq_merged_request,
+		.elevator_merge_req_fn =	bfq_merged_requests,
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		.elevator_bio_merged_fn =	bfq_bio_merged,
+#endif
+		.elevator_allow_merge_fn =	bfq_allow_merge,
+		.elevator_dispatch_fn =		bfq_dispatch_requests,
+		.elevator_add_req_fn =		bfq_insert_request,
+		.elevator_activate_req_fn =	bfq_activate_request,
+		.elevator_deactivate_req_fn =	bfq_deactivate_request,
+		.elevator_completed_req_fn =	bfq_completed_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
+		.elevator_init_icq_fn =		bfq_init_icq,
+		.elevator_exit_icq_fn =		bfq_exit_icq,
+		.elevator_set_req_fn =		bfq_set_request,
+		.elevator_put_req_fn =		bfq_put_request,
+		.elevator_may_queue_fn =	bfq_may_queue,
+		.elevator_init_fn =		bfq_init_queue,
+		.elevator_exit_fn =		bfq_exit_queue,
+	},
+	.icq_size =		sizeof(struct bfq_io_cq),
+	.icq_align =		__alignof__(struct bfq_io_cq),
+	.elevator_attrs =	bfq_attrs,
+	.elevator_name =	"bfq",
+	.elevator_owner =	THIS_MODULE,
+};
+
+static int __init bfq_init(void)
+{
+	int ret;
+
+	/*
+	 * Can be 0 on HZ < 1000 setups.
+	 */
+	if (bfq_slice_idle == 0)
+		bfq_slice_idle = 1;
+
+	if (bfq_timeout_async == 0)
+		bfq_timeout_async = 1;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	ret = blkcg_policy_register(&blkcg_policy_bfq);
+	if (ret)
+		return ret;
+#endif
+
+	ret = -ENOMEM;
+	if (bfq_slab_setup())
+		goto err_pol_unreg;
+
+	/*
+	 * Times to load large popular applications for the typical systems
+	 * installed on the reference devices (see the comments before the
+	 * definitions of the two arrays).
+	 */
+	T_slow[0] = msecs_to_jiffies(2600);
+	T_slow[1] = msecs_to_jiffies(1000);
+	T_fast[0] = msecs_to_jiffies(5500);
+	T_fast[1] = msecs_to_jiffies(2000);
+
+	/*
+	 * Thresholds that determine the switch between speed classes (see
+	 * the comments before the definition of the array).
+	 */
+	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
+	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
+
+	ret = elv_register(&iosched_bfq);
+	if (ret)
+		goto err_pol_unreg;
+
+	pr_info("BFQ I/O-scheduler: v7r11\n");
+
+	return 0;
+
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	return ret;
+}
+
+static void __exit bfq_exit(void)
+{
+	elv_unregister(&iosched_bfq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
+MODULE_LICENSE("GPL");
diff --git b/block/bfq-sched.c b/block/bfq-sched.c
new file mode 100644
index 0000000..a64fec1
--- /dev/null
+++ b/block/bfq-sched.c
@@ -0,0 +1,1200 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#define for_each_entity(entity)	\
+	for (; entity ; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd);
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+static void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+	struct bfq_entity *bfqg_entity;
+	struct bfq_group *bfqg;
+	struct bfq_sched_data *group_sd;
+
+	BUG_ON(!next_in_service);
+
+	group_sd = next_in_service->sched_data;
+
+	bfqg = container_of(group_sd, struct bfq_group, sched_data);
+	/*
+	 * bfq_group's my_entity field is not NULL only if the group
+	 * is not the root group. We must not touch the root entity
+	 * as it must never become an in-service entity.
+	 */
+	bfqg_entity = bfqg->my_entity;
+	if (bfqg_entity)
+		bfqg_entity->budget = next_in_service->budget;
+}
+
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	struct bfq_entity *next_in_service;
+
+	if (sd->in_service_entity)
+		/* will update/requeue at the end of service */
+		return 0;
+
+	/*
+	 * NOTE: this can be improved in many ways, such as returning
+	 * 1 (and thus propagating upwards the update) only when the
+	 * budget changes, or caching the bfqq that will be scheduled
+	 * next from this subtree.  For now we worry more about
+	 * correctness than about performance...
+	 */
+	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
+	sd->next_in_service = next_in_service;
+
+	if (next_in_service)
+		bfq_update_budget(next_in_service);
+
+	return 1;
+}
+
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
+{
+	BUG_ON(sd->next_in_service != entity);
+}
+#else
+#define for_each_entity(entity)	\
+	for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity ; entity = parent)
+
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	return 0;
+}
+
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
+{
+}
+
+static void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+}
+#endif
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT	22
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
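+
+/*
+ * Purely illustrative sketch, never called by the scheduler: it shows why
+ * bfq_gt() compares the signed difference instead of the raw values.  A
+ * timestamp that has wrapped past ULLONG_MAX must still be seen as later
+ * than one taken just before the wrap.
+ */
+static inline void bfq_gt_wrap_example(void)
+{
+	u64 before_wrap = ULLONG_MAX - 5;
+	u64 after_wrap = before_wrap + 10;	/* wraps around to 4 */
+
+	/* the wrapped timestamp is still logically later... */
+	BUG_ON(!bfq_gt(after_wrap, before_wrap));
+	/* ...while a plain '>' would claim the opposite */
+	BUG_ON(after_wrap > before_wrap);
+}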
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = NULL;
+
+	BUG_ON(!entity);
+
+	if (!entity->my_sched_data)
+		bfqq = container_of(entity, struct bfq_queue, entity);
+
+	return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+	do_div(d, weight);
+	return d;
+}
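+
+/*
+ * Worked example (illustrative numbers only): with WFQ_SERVICE_SHIFT == 22,
+ * the same amount of service produces a virtual-time delta inversely
+ * proportional to the weight:
+ *
+ *	bfq_delta(1000, 100) == (1000 << 22) / 100 == 41943040
+ *	bfq_delta(1000, 200) == (1000 << 22) / 200 == 20971520
+ *
+ * Doubling the weight halves the charged delta, so the heavier entity's
+ * finish timestamp grows half as fast and it ends up receiving twice the
+ * share of service.
+ */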
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	BUG_ON(entity->weight == 0);
+
+	entity->finish = entity->start +
+		bfq_delta(service, entity->weight);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: serv %lu, w %d",
+			service, entity->weight);
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: start %llu, finish %llu, delta %llu",
+			entity->start, entity->finish,
+			bfq_delta(service, entity->weight));
+	}
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity.  This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+	struct bfq_entity *entity = NULL;
+
+	if (node)
+		entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+	BUG_ON(entity->tree != root);
+
+	entity->tree = NULL;
+	rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+			     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *next;
+
+	BUG_ON(entity->tree != &st->idle);
+
+	if (entity == st->first_idle) {
+		next = rb_next(&entity->rb_node);
+		st->first_idle = bfq_entity_of(next);
+	}
+
+	if (entity == st->last_idle) {
+		next = rb_prev(&entity->rb_node);
+		st->last_idle = bfq_entity_of(next);
+	}
+
+	bfq_extract(&st->idle, entity);
+
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+	struct bfq_entity *entry;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(entity->tree);
+
+	while (*node) {
+		parent = *node;
+		entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+		if (bfq_gt(entry->finish, entity->finish))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, root);
+
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of an entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree.  The function  assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+	struct bfq_entity *child;
+
+	if (node) {
+		child = rb_entry(node, struct bfq_entity, rb_node);
+		if (bfq_gt(entity->min_start, child->min_start))
+			entity->min_start = child->min_start;
+	}
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value.  The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	entity->min_start = entity->start;
+	bfq_update_min(entity, node->rb_right);
+	bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update.  This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root.  The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+up:
+	bfq_update_active_node(node);
+
+	parent = rb_parent(node);
+	if (!parent)
+		return;
+
+	if (node == parent->rb_left && parent->rb_right)
+		bfq_update_active_node(parent->rb_right);
+	else if (parent->rb_left)
+		bfq_update_active_node(parent->rb_left);
+
+	node = parent;
+	goto up;
+}
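+
+/*
+ * Worked example (illustrative values only) of the min_start bookkeeping.
+ * Consider an active tree ordered by finish time F, with start times S and
+ * the cached subtree minimum min_start:
+ *
+ *	              (F=30, S=12, min_start=3)
+ *	             /                         \
+ *	(F=20, S=3, min_start=3)     (F=40, S=25, min_start=25)
+ *
+ * If the left node is extracted, only the nodes on the path to the root
+ * can hold a stale value; bfq_update_active_tree() walks that path and
+ * recomputes the root's min_start as min(S=12, min_start=25) = 12.
+ */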
+
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root);
+
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root);
+
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	bfq_insert(&st->active, entity);
+
+	if (node->rb_left)
+		node = node->rb_left;
+	else if (node->rb_right)
+		node = node->rb_right;
+
+	bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		bfqg->active_entities++;
+		if (bfqg->active_entities == 2)
+			bfqd->active_numerous_groups++;
+	}
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal to
+ * or larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
+	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
+		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
+}
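+
+/*
+ * Worked example of the two conversions above, assuming IOPRIO_BE_NR == 8
+ * and BFQ_WEIGHT_CONVERSION_COEFF == 10, i.e., a pivot of 80:
+ *
+ *	bfq_ioprio_to_weight(0) == 80	highest best-effort priority
+ *	bfq_ioprio_to_weight(4) == 76	default ioprio
+ *	bfq_ioprio_to_weight(7) == 73	lowest best-effort priority
+ *
+ *	bfq_weight_to_ioprio(76)  == 4	round trip of the default
+ *	bfq_weight_to_ioprio(100) == 0	weights >= 80 map to the escape value
+ */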
+
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (bfqq) {
+		atomic_inc(&bfqq->ref);
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+			     bfqq, atomic_read(&bfqq->ref));
+	}
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch.  If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *deepest;
+
+	if (!node->rb_right && !node->rb_left)
+		deepest = rb_parent(node);
+	else if (!node->rb_right)
+		deepest = node->rb_left;
+	else if (!node->rb_left)
+		deepest = node->rb_right;
+	else {
+		deepest = rb_next(node);
+		if (deepest->rb_right)
+			deepest = deepest->rb_right;
+		else if (rb_parent(deepest) != node)
+			deepest = rb_parent(deepest);
+	}
+
+	return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+			       struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	node = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (node)
+		bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		BUG_ON(!bfqg->active_entities);
+		bfqg->active_entities--;
+		if (bfqg->active_entities == 1) {
+			BUG_ON(!bfqd->active_numerous_groups);
+			bfqd->active_numerous_groups--;
+		}
+	}
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+			    struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+		st->first_idle = entity;
+	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, dropping
+ * the device reference to it if it is a queue.  Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_sched_data *sd;
+
+	BUG_ON(!entity->on_st);
+
+	entity->on_st = 0;
+	st->wsum -= entity->weight;
+	if (bfqq) {
+		sd = entity->sched_data;
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+	}
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+static void bfq_put_idle_entity(struct bfq_service_tree *st,
+				struct bfq_entity *entity)
+{
+	bfq_idle_extract(st, entity);
+	bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+	    !bfq_gt(last_idle->finish, st->vtime)) {
+		/*
+		 * Forget the whole idle tree, increasing the vtime past
+		 * the last finish time of idle entities.
+		 */
+		st->vtime = last_idle->finish;
+	}
+
+	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+		bfq_put_idle_entity(st, first_idle);
+}
+
+static struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+			 struct bfq_entity *entity)
+{
+	struct bfq_service_tree *new_st = old_st;
+
+	if (entity->prio_changed) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+		unsigned short prev_weight, new_weight;
+		struct bfq_data *bfqd = NULL;
+		struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		struct bfq_sched_data *sd;
+		struct bfq_group *bfqg;
+#endif
+
+		if (bfqq)
+			bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			sd = entity->my_sched_data;
+			bfqg = container_of(sd, struct bfq_group, sched_data);
+			BUG_ON(!bfqg);
+			bfqd = (struct bfq_data *)bfqg->bfqd;
+			BUG_ON(!bfqd);
+		}
+#endif
+
+		BUG_ON(old_st->wsum < entity->weight);
+		old_st->wsum -= entity->weight;
+
+		if (entity->new_weight != entity->orig_weight) {
+			if (entity->new_weight < BFQ_MIN_WEIGHT ||
+			    entity->new_weight > BFQ_MAX_WEIGHT) {
+				printk(KERN_CRIT "update_weight_prio: "
+						 "new_weight %d\n",
+					entity->new_weight);
+				BUG();
+			}
+			entity->orig_weight = entity->new_weight;
+			if (bfqq)
+				bfqq->ioprio =
+				  bfq_weight_to_ioprio(entity->orig_weight);
+		}
+
+		if (bfqq)
+			bfqq->ioprio_class = bfqq->new_ioprio_class;
+		entity->prio_changed = 0;
+
+		/*
+		 * NOTE: here we may be changing the weight too early,
+		 * this will cause unfairness.  The correct approach
+		 * would have required additional complexity to defer
+		 * weight changes to the proper time instants (i.e.,
+		 * when entity->finish <= old_st->vtime).
+		 */
+		new_st = bfq_entity_service_tree(entity);
+
+		prev_weight = entity->weight;
+		new_weight = entity->orig_weight *
+			     (bfqq ? bfqq->wr_coeff : 1);
+		/*
+		 * If the weight of the entity changes, remove the entity
+		 * from its old weight counter (if there is a counter
+		 * associated with the entity), and add it to the counter
+		 * associated with its new weight.
+		 */
+		if (prev_weight != new_weight) {
+			root = bfqq ? &bfqd->queue_weights_tree :
+				      &bfqd->group_weights_tree;
+			bfq_weights_tree_remove(bfqd, entity, root);
+		}
+		entity->weight = new_weight;
+		/*
+		 * Add the entity to its weights tree only if it is
+		 * not associated with a weight-raised queue.
+		 */
+		if (prev_weight != new_weight &&
+		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+			/* If we get here, root has been initialized. */
+			bfq_weights_tree_add(bfqd, entity, root);
+
+		new_st->wsum += entity->weight;
+
+		if (new_st != old_st)
+			entity->start = new_st->vtime;
+	}
+
+	return new_st;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+#endif
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service.  For now,
+ * we keep it this way to make consistency easier to check.
+ */
+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st;
+
+	for_each_entity(entity) {
+		st = bfq_entity_service_tree(entity);
+
+		entity->service += served;
+		BUG_ON(entity->service > entity->budget);
+		BUG_ON(st->wsum == 0);
+
+		st->vtime += bfq_delta(served, st->wsum);
+		bfq_forget_idle(st);
+	}
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+#endif
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d sects", served);
+}
+
+/**
+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
+ * @bfqq: the queue that needs a service update.
+ *
+ * When it's not possible to be fair in the service domain, because
+ * a queue is not consuming its budget fast enough (the meaning of
+ * fast depends on the timeout parameter), we charge it a full
+ * budget.  In this way we should obtain a sort of time-domain
+ * fairness among all the seeky/slow queues.
+ */
+static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
+
+	bfq_bfqq_served(bfqq, entity->budget - entity->service);
+}
+
+/**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+ * budget exhaustion.  It uses the current budget of the entity (and the
+ * service received if @entity is active) of the queue to calculate its
+ * timestamps.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (entity == sd->in_service_entity) {
+		BUG_ON(entity->tree);
+		/*
+		 * If we are requeueing the current entity we have
+		 * to take care of not charging to it service it has
+		 * not received.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active) {
+		/*
+		 * Requeueing an entity due to a change of some
+		 * next_in_service entity below it.  We reuse the
+		 * old start time.
+		 */
+		bfq_active_extract(st, entity);
+	} else if (entity->tree == &st->idle) {
+		/*
+		 * Must be on the idle tree, bfq_idle_extract() will
+		 * check for that.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(st->vtime, entity->finish) ?
+				       st->vtime : entity->finish;
+	} else {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = st->vtime;
+		st->wsum += entity->weight;
+		bfq_get_entity(entity);
+
+		BUG_ON(entity->on_st);
+		entity->on_st = 1;
+	}
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+	bfq_active_insert(st, entity);
+}
+
+/**
+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
+ * @entity: the entity to activate.
+ *
+ * Activate @entity and all the entities on the path from it to the root.
+ */
+static void bfq_activate_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd;
+
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd))
+			/*
+			 * No need to propagate the activation to the
+			 * upper entities, as they will be updated when
+			 * the in-service entity is rescheduled.
+			 */
+			break;
+	}
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @requeue: if false, the entity will not be put into the idle tree.
+ *
+ * Deactivate an entity, independently from its previous state.  If the
+ * entity was not on a service tree just return, otherwise if it is on
+ * any scheduler tree, extract it from that tree, and if necessary
+ * and if the caller did not specify @requeue, put it on the idle tree.
+ *
+ * Return %1 if the caller should update the entity hierarchy, i.e.,
+ * if the entity was in service or if it was the next_in_service for
+ * its sched_data; return %0 otherwise.
+ */
+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st;
+	int was_in_service;
+	int ret = 0;
+
+	if (sd == NULL || !entity->on_st) /* never activated, or inactive */
+		return 0;
+
+	st = bfq_entity_service_tree(entity);
+	was_in_service = entity == sd->in_service_entity;
+
+	BUG_ON(was_in_service && entity->tree);
+
+	if (was_in_service) {
+		bfq_calc_finish(entity, entity->service);
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+	else if (entity->tree)
+		BUG();
+
+	if (was_in_service || sd->next_in_service == entity)
+		ret = bfq_update_next_in_service(sd);
+
+	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+		bfq_forget_entity(st, entity);
+	else
+		bfq_idle_insert(st, entity);
+
+	BUG_ON(sd->in_service_entity == entity);
+	BUG_ON(sd->next_in_service == entity);
+
+	return ret;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity.
+ * @entity: the entity to deactivate.
+ * @requeue: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd;
+	struct bfq_entity *parent;
+
+	for_each_entity_safe(entity, parent) {
+		sd = entity->sched_data;
+
+		if (!__bfq_deactivate_entity(entity, requeue))
+			/*
+			 * The parent entity is still backlogged, and
+			 * we don't need to update it as it is still
+			 * in service.
+			 */
+			break;
+
+		if (sd->next_in_service)
+			/*
+			 * The parent entity is still backlogged and
+			 * the budgets on the path towards the root
+			 * need to be updated.
+			 */
+			goto update;
+
+		/*
+	 * If we get here, the parent is no longer backlogged and
+	 * we want to propagate the dequeue upwards.
+		 */
+		requeue = 1;
+	}
+
+	return;
+
+update:
+	entity = parent;
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd))
+			break;
+	}
+}
+
+/**
+ * bfq_update_vtime - update vtime if necessary.
+ * @st: the service tree to act upon.
+ *
+ * If necessary update the service tree vtime to have at least one
+ * eligible entity, skipping to its start time.  Assumes that the
+ * active tree of the device is not empty.
+ *
+ * NOTE: this hierarchical implementation updates vtimes quite often,
+ * we may end up with reactivated processes getting timestamps after a
+ * vtime skip done because we needed a ->first_active entity on some
+ * intermediate node.
+ */
+static void bfq_update_vtime(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry;
+	struct rb_node *node = st->active.rb_node;
+
+	entry = rb_entry(node, struct bfq_entity, rb_node);
+	if (bfq_gt(entry->min_start, st->vtime)) {
+		st->vtime = entry->min_start;
+		bfq_forget_idle(st);
+	}
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start <= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry, *first = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node) {
+		entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+		if (!bfq_gt(entry->start, st->vtime))
+			first = entry;
+
+		BUG_ON(bfq_gt(entry->min_start, st->vtime));
+
+		if (node->rb_left) {
+			entry = rb_entry(node->rb_left,
+					 struct bfq_entity, rb_node);
+			if (!bfq_gt(entry->min_start, st->vtime)) {
+				node = node->rb_left;
+				goto left;
+			}
+		}
+		if (first)
+			break;
+		node = node->rb_right;
+	}
+
+	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
+	return first;
+}
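+
+/*
+ * Illustrative sketch, never called by the scheduler: the linear-scan
+ * equivalent of bfq_first_active_entity().  Among the entities whose start
+ * time has already been reached by the virtual time (start <= vtime), pick
+ * the one with the smallest finish time; the augmented tree above computes
+ * the same answer in O(log N) by pruning subtrees via min_start.
+ */
+static inline struct bfq_entity *
+bfq_first_active_entity_linear(struct bfq_entity **entities, int n, u64 vtime)
+{
+	struct bfq_entity *best = NULL;
+	int i;
+
+	for (i = 0; i < n; i++) {
+		if (bfq_gt(entities[i]->start, vtime))
+			continue;	/* not yet eligible */
+		if (!best || bfq_gt(best->finish, entities[i]->finish))
+			best = entities[i];
+	}
+
+	return best;
+}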
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
+ */
+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
+						   bool force)
+{
+	struct bfq_entity *entity, *new_next_in_service = NULL;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	bfq_update_vtime(st);
+	entity = bfq_first_active_entity(st);
+	BUG_ON(bfq_gt(entity->start, st->vtime));
+
+	/*
+	 * If the chosen entity does not match the sched_data's
+	 * next_in_service and we are forcibly serving the IDLE priority
+	 * class tree, bubble up the budget update.
+	 */
+	if (unlikely(force && entity != entity->sched_data->next_in_service)) {
+		new_next_in_service = entity;
+		for_each_entity(new_next_in_service)
+			bfq_update_budget(new_next_in_service);
+	}
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ * @extract: if true the returned entity will be also extracted from @sd.
+ *
+ * NOTE: since we cache the next_in_service entity at each level of the
+ * hierarchy, the complexity of the lookup can be decreased with
+ * absolutely no effort by just returning the cached next_in_service value;
+ * we prefer to do full lookups to test the consistency of the data
+ * structures.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd)
+{
+	struct bfq_service_tree *st = sd->service_tree;
+	struct bfq_entity *entity;
+	int i = 0;
+
+	BUG_ON(sd->in_service_entity);
+
+	if (bfqd &&
+	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
+		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
+						  true);
+		if (entity) {
+			i = BFQ_IOPRIO_CLASSES - 1;
+			bfqd->bfq_class_idle_last_service = jiffies;
+			sd->next_in_service = entity;
+		}
+	}
+	for (; i < BFQ_IOPRIO_CLASSES; i++) {
+		entity = __bfq_lookup_next_entity(st + i, false);
+		if (entity) {
+			if (extract) {
+				bfq_check_next_in_service(sd, entity);
+				bfq_active_extract(st + i, entity);
+				sd->in_service_entity = entity;
+				sd->next_in_service = NULL;
+			}
+			break;
+		}
+	}
+
+	return entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+	struct bfq_entity *entity = NULL;
+	struct bfq_sched_data *sd;
+	struct bfq_queue *bfqq;
+
+	BUG_ON(bfqd->in_service_queue);
+
+	if (bfqd->busy_queues == 0)
+		return NULL;
+
+	sd = &bfqd->root_group->sched_data;
+	for (; sd ; sd = entity->my_sched_data) {
+		entity = bfq_lookup_next_entity(sd, 1, bfqd);
+		BUG_ON(!entity);
+		entity->service = 0;
+	}
+
+	bfqq = bfq_entity_to_bfqq(entity);
+	BUG_ON(!bfqq);
+
+	return bfqq;
+}
+
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	if (bfqd->in_service_bic) {
+		put_io_context(bfqd->in_service_bic->icq.ioc);
+		bfqd->in_service_bic = NULL;
+	}
+
+	bfqd->in_service_queue = NULL;
+	del_timer(&bfqd->idle_slice_timer);
+}
+
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				int requeue)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfqq == bfqd->in_service_queue)
+		__bfq_bfqd_reset_in_service(bfqd);
+
+	bfq_deactivate_entity(entity, requeue);
+}
+
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_entity(entity);
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+#endif
+
+/*
+ * Called when the bfqq no longer has requests pending; remove it from
+ * the service tree.
+ */
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			      int requeue)
+{
+	BUG_ON(!bfq_bfqq_busy(bfqq));
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+
+	BUG_ON(bfqd->busy_queues == 0);
+	bfqd->busy_queues--;
+
+	if (!bfqq->dispatched) {
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->busy_in_flight_queues);
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq)) {
+				BUG_ON(!bfqd->
+					const_seeky_busy_in_flight_queues);
+				bfqd->const_seeky_busy_in_flight_queues--;
+			}
+		}
+	}
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues--;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+#endif
+
+	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqq == bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	bfqd->busy_queues++;
+
+	if (!bfqq->dispatched) {
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			bfqd->busy_in_flight_queues++;
+			if (bfq_bfqq_constantly_seeky(bfqq))
+				bfqd->const_seeky_busy_in_flight_queues++;
+		}
+	}
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues++;
+}
diff --git b/block/bfq.h b/block/bfq.h
new file mode 100644
index 0000000..32dfcee
--- /dev/null
+++ b/block/bfq.h
@@ -0,0 +1,867 @@
+/*
+ * BFQ-v7r11 for 4.4.0: data structures and common functions prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/ioprio.h>
+#include <linux/rbtree.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES	3
+#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
+
+#define BFQ_MIN_WEIGHT			1
+#define BFQ_MAX_WEIGHT			1000
+#define BFQ_WEIGHT_CONVERSION_COEFF	10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO	4
+
+#define BFQ_DEFAULT_GRP_WEIGHT	10
+#define BFQ_DEFAULT_GRP_IOPRIO	0
+#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ * @active: tree for active entities (i.e., those backlogged).
+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
+ * @first_idle: idle entity with minimum F_i.
+ * @last_idle: idle entity with maximum F_i.
+ * @vtime: scheduler virtual time.
+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+	struct rb_root active;
+	struct rb_root idle;
+
+	struct bfq_entity *first_idle;
+	struct bfq_entity *last_idle;
+
+	u64 vtime;
+	unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ * @in_service_entity: entity in service.
+ * @next_in_service: head-of-the-line entity in the scheduler.
+ * @service_tree: array of service trees, one per ioprio_class.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as
+ * an intermediate queue on a hierarchical setup.
+ * @next_in_service points to the active entity of the sched_data
+ * service trees that will be scheduled next.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority classes are served before all the
+ * requests from lower priority classes; within the same class,
+ * requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+	struct bfq_entity *in_service_entity;
+	struct bfq_entity *next_in_service;
+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ * @weight: weight of the entities that this counter refers to.
+ * @num_active: number of active entities with this weight.
+ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
+ *                and @group_weights_tree).
+ */
+struct bfq_weight_counter {
+	short int weight;
+	unsigned int num_active;
+	struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ * @rb_node: service_tree member.
+ * @weight_counter: pointer to the weight counter associated with this entity.
+ * @on_st: flag, true if the entity is on a tree (either the active or
+ *         the idle one of its service_tree).
+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
+ * @start: B-WF2Q+ start timestamp (aka S_i).
+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
+ * @min_start: minimum start time of the (active) subtree rooted at
+ *             this entity; used for O(log N) lookups into active trees.
+ * @service: service received during the last round of service.
+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
+ * @weight: weight of the queue
+ * @parent: parent entity, for hierarchical scheduling.
+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
+ *                 associated scheduler queue, %NULL on leaf nodes.
+ * @sched_data: the scheduler queue this entity belongs to.
+ * @ioprio: the ioprio in use.
+ * @new_weight: when a weight change is requested, the new weight value.
+ * @orig_weight: original weight, used to implement weight boosting
+ * @prio_changed: flag, true when the user requested a weight, ioprio or
+ *		  ioprio_class change.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now.  Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag.  As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+	struct rb_node rb_node;
+	struct bfq_weight_counter *weight_counter;
+
+	int on_st;
+
+	u64 finish;
+	u64 start;
+
+	struct rb_root *tree;
+
+	u64 min_start;
+
+	int service, budget;
+	unsigned short weight, new_weight;
+	unsigned short orig_weight;
+
+	struct bfq_entity *parent;
+
+	struct bfq_sched_data *my_sched_data;
+	struct bfq_sched_data *sched_data;
+
+	int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ * @ref: reference counter.
+ * @bfqd: parent bfq_data.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ *                    ioprio_class value.
+ * @new_bfqq: shared bfq_queue if queue is cooperating with
+ *           one or more other queues.
+ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).
+ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).
+ * @sort_list: sorted list of pending requests.
+ * @next_rq: if fifo isn't expired, next request to serve.
+ * @queued: nr of requests queued in @sort_list.
+ * @allocated: currently allocated requests.
+ * @meta_pending: pending metadata requests.
+ * @fifo: fifo list of requests in sort_list.
+ * @entity: entity representing this queue in the scheduler.
+ * @max_budget: maximum budget allowed from the feedback mechanism.
+ * @budget_timeout: budget expiration (in jiffies).
+ * @dispatched: number of requests on the dispatch list or inside driver.
+ * @flags: status flags.
+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
+ * @burst_list_node: node for the device's burst list.
+ * @seek_samples: number of seeks sampled
+ * @seek_total: sum of the distances of the seeks sampled
+ * @seek_mean: mean seek distance
+ * @last_request_pos: position of the last request enqueued
+ * @requests_within_timer: number of consecutive pairs of request completion
+ *                         and arrival, such that the queue becomes idle
+ *                         after the completion, but the next request arrives
+ *                         within an idle time slice; used only if the queue's
+ *                         IO_bound flag has been cleared.
+ * @pid: pid of the process owning the queue, used for logging purposes.
+ * @last_wr_start_finish: start time of the current weight-raising period if
+ *                        the @bfq-queue is being weight-raised, otherwise
+ *                        finish time of the last weight-raising period
+ * @wr_cur_max_time: current max raising time for this queue
+ * @soft_rt_next_start: minimum time instant such that, only if a new
+ *                      request is enqueued after this time instant in an
+ *                      idle @bfq_queue with no outstanding requests, then
+ *                      the task associated with the queue it is deemed as
+ *                      soft real-time (see the comments to the function
+ *                      bfq_bfqq_softrt_next_start())
+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
+ *                      idle to backlogged
+ * @service_from_backlogged: cumulative service received from the @bfq_queue
+ *                           since the last transition from idle to
+ *                           backlogged
+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
+ *	 queue is shared
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with one or
+ * more io_contexts if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+	atomic_t ref;
+	struct bfq_data *bfqd;
+
+	unsigned short ioprio, new_ioprio;
+	unsigned short ioprio_class, new_ioprio_class;
+
+	/* fields for cooperating queues handling */
+	struct bfq_queue *new_bfqq;
+	struct rb_node pos_node;
+	struct rb_root *pos_root;
+
+	struct rb_root sort_list;
+	struct request *next_rq;
+	int queued[2];
+	int allocated[2];
+	int meta_pending;
+	struct list_head fifo;
+
+	struct bfq_entity entity;
+
+	int max_budget;
+	unsigned long budget_timeout;
+
+	int dispatched;
+
+	unsigned int flags;
+
+	struct list_head bfqq_list;
+
+	struct hlist_node burst_list_node;
+
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+	sector_t last_request_pos;
+
+	unsigned int requests_within_timer;
+
+	pid_t pid;
+	struct bfq_io_cq *bic;
+
+	/* weight-raising fields */
+	unsigned long wr_cur_max_time;
+	unsigned long soft_rt_next_start;
+	unsigned long last_wr_start_finish;
+	unsigned int wr_coeff;
+	unsigned long last_idle_bklogged;
+	unsigned long service_from_backlogged;
+};
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ * @ttime_total: total process thinktime
+ * @ttime_samples: number of thinktime samples
+ * @ttime_mean: average process thinktime
+ */
+struct bfq_ttime {
+	unsigned long last_end_request;
+
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ * @icq: associated io_cq structure
+ * @bfqq: array of two process queues, the sync and the async
+ * @ttime: associated @bfq_ttime struct
+ * @ioprio: per (request_queue, blkcg) ioprio.
+ * @blkcg_id: id of the blkcg the related io_cq belongs to.
+ * @wr_time_left: snapshot of the time left before weight raising ends
+ *                for the sync queue associated to this process; this
+ *		  snapshot is taken to remember this value while the weight
+ *		  raising is suspended because the queue is merged with a
+ *		  shared queue, and is used to set @wr_cur_max_time
+ *		  when the queue is split from the shared queue and its
+ *		  weight is raised again
+ * @saved_idle_window: same purpose as the previous field for the idle
+ *                     window
+ * @saved_IO_bound: same purpose as the previous two fields for the I/O
+ *                  bound classification of a queue
+ * @saved_in_large_burst: same purpose as the previous fields for the
+ *                        value of the field keeping the queue's belonging
+ *                        to a large burst
+ * @was_in_burst_list: true if the queue belonged to a burst list
+ *                     before its merge with another cooperating queue
+ * @cooperations: counter of consecutive successful queue merges underwent
+ *                by any of the process' @bfq_queues
+ * @failed_cooperations: counter of consecutive failed queue merges of any
+ *                       of the process' @bfq_queues
+ */
+struct bfq_io_cq {
+	struct io_cq icq; /* must be the first member */
+	struct bfq_queue *bfqq[2];
+	struct bfq_ttime ttime;
+	int ioprio;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_id; /* the current blkcg ID */
+#endif
+
+	unsigned int wr_time_left;
+	bool saved_idle_window;
+	bool saved_IO_bound;
+
+	bool saved_in_large_burst;
+	bool was_in_burst_list;
+
+	unsigned int cooperations;
+	unsigned int failed_cooperations;
+};
+
+enum bfq_device_speed {
+	BFQ_BFQD_FAST,
+	BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per device data structure.
+ * @queue: request queue for the managed device.
+ * @root_group: root bfq_group for the device.
+ * @active_numerous_groups: number of bfq_groups containing more than one
+ *                          active @bfq_entity.
+ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
+ *                      weight. Used to keep track of whether all @bfq_queues
+ *                     have the same weight. The tree contains one counter
+ *                     for each distinct weight associated to some active
+ *                     and not weight-raised @bfq_queue (see the comments to
+ *                      the functions bfq_weights_tree_[add|remove] for
+ *                     further details).
+ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
+ *                      by weight. Used to keep track of whether all
+ *                     @bfq_groups have the same weight. The tree contains
+ *                     one counter for each distinct weight associated to
+ *                     some active @bfq_group (see the comments to the
+ *                     functions bfq_weights_tree_[add|remove] for further
+ *                     details).
+ * @busy_queues: number of bfq_queues containing requests (including the
+ *		 queue in service, even if it is idling).
+ * @busy_in_flight_queues: number of @bfq_queues containing pending or
+ *                         in-flight requests, plus the @bfq_queue in
+ *                         service, even if idle but waiting for the
+ *                         possible arrival of its next sync request. This
+ *                         field is updated only if the device is rotational,
+ *                         but used only if the device is also NCQ-capable.
+ *                         The reason why the field is updated also for non-
+ *                         NCQ-capable rotational devices is related to the
+ *                         fact that the value of @hw_tag may be set also
+ *                         later than when busy_in_flight_queues may need to
+ *                         be incremented for the first time(s). Taking also
+ *                         this possibility into account, to avoid unbalanced
+ *                         increments/decrements, would imply more overhead
+ *                         than just updating busy_in_flight_queues
+ *                         regardless of the value of @hw_tag.
+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
+ *                                     (that is, seeky queues that expired
+ *                                     for budget timeout at least once)
+ *                                     containing pending or in-flight
+ *                                     requests, including the in-service
+ *                                     @bfq_queue if constantly seeky. This
+ *                                     field is updated only if the device
+ *                                     is rotational, but used only if the
+ *                                     device is also NCQ-capable (see the
+ *                                     comments to @busy_in_flight_queues).
+ * @wr_busy_queues: number of weight-raised busy @bfq_queues.
+ * @queued: number of queued requests.
+ * @rq_in_driver: number of requests dispatched and waiting for completion.
+ * @sync_flight: number of sync requests in the driver.
+ * @max_rq_in_driver: max number of reqs in driver in the last
+ *                    @hw_tag_samples completed requests.
+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
+ * @budgets_assigned: number of budgets assigned.
+ * @idle_slice_timer: timer set when idling for the next sequential request
+ *                    from the queue in service.
+ * @unplug_work: delayed work to restart dispatching on the request queue.
+ * @in_service_queue: bfq_queue in service.
+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
+ * @last_position: on-disk position of the last served request.
+ * @last_budget_start: beginning of the last budget.
+ * @last_idling_start: beginning of the last idle slice.
+ * @peak_rate: peak transfer rate observed for a budget.
+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before
+ *                  rescheduling.
+ * @active_list: list of all the bfq_queues active on the device.
+ * @idle_list: list of all the bfq_queues idle on the device.
+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
+ *                   requests are served in fifo order.
+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
+ * @bfq_back_max: maximum allowed backward seek.
+ * @bfq_slice_idle: maximum idling time.
+ * @bfq_user_max_budget: user-configured max budget value
+ *                       (0 for auto-tuning).
+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
+ *                           async queues.
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
+ *               prevent seeky queues from imposing long latencies on
+ *               well-behaved ones (this also implies that seeky queues
+ *               cannot receive guarantees in the service domain; after a
+ *               timeout they are charged for the whole allocated budget,
+ *               to try to preserve a behavior reasonably fair among them,
+ *               but without service-domain guarantees).
+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
+ *                   no longer granted any weight-raising.
+ * @bfq_failed_cooperations: number of consecutive failed cooperation
+ *                           chances after which weight-raising is restored
+ *                           to a queue subject to more than bfq_coop_thresh
+ *                           queue merges.
+ * @bfq_requests_within_timer: number of consecutive requests that must be
+ *                             issued within the idle time slice to set
+ *                             again idling to a queue which was marked as
+ *                             non-I/O-bound (see the definition of the
+ *                             IO_bound flag for further details).
+ * @last_ins_in_burst: last time at which a queue entered the current
+ *                     burst of queues being activated shortly after
+ *                     each other; for more details about this and the
+ *                     following parameters related to a burst of
+ *                     activations, see the comments to the function
+ *                     @bfq_handle_burst.
+ * @bfq_burst_interval: reference time interval used to decide whether a
+ *                      queue has been activated shortly after
+ *                      @last_ins_in_burst.
+ * @burst_size: number of queues in the current burst of queue activations.
+ * @bfq_large_burst_thresh: maximum burst size above which the current
+ * 			    queue-activation burst is deemed as 'large'.
+ * @large_burst: true if a large queue-activation burst is in progress.
+ * @burst_list: head of the burst list (as for the above fields, more details
+ * 		in the comments to the function bfq_handle_burst).
+ * @low_latency: if set to true, low-latency heuristics are enabled.
+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
+ *                queue is multiplied.
+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
+ *			  may be reactivated for a queue (in jiffies).
+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
+ *				after which weight-raising may be
+ *				reactivated for an already busy queue
+ *				(in jiffies).
+ * @bfq_wr_max_softrt_rate: max service rate for a soft real-time queue,
+ *			    in sectors per second.
+ * @RT_prod: cached value of the product R*T, used to compute the maximum
+ *	     duration of weight raising automatically.
+ * @device_speed: device-speed class for the low-latency heuristic.
+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
+ *
+ * All the fields are protected by the @queue lock.
+ */
+struct bfq_data {
+	struct request_queue *queue;
+
+	struct bfq_group *root_group;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	int active_numerous_groups;
+#endif
+
+	struct rb_root queue_weights_tree;
+	struct rb_root group_weights_tree;
+
+	int busy_queues;
+	int busy_in_flight_queues;
+	int const_seeky_busy_in_flight_queues;
+	int wr_busy_queues;
+	int queued;
+	int rq_in_driver;
+	int sync_flight;
+
+	int max_rq_in_driver;
+	int hw_tag_samples;
+	int hw_tag;
+
+	int budgets_assigned;
+
+	struct timer_list idle_slice_timer;
+	struct work_struct unplug_work;
+
+	struct bfq_queue *in_service_queue;
+	struct bfq_io_cq *in_service_bic;
+
+	sector_t last_position;
+
+	ktime_t last_budget_start;
+	ktime_t last_idling_start;
+	int peak_rate_samples;
+	u64 peak_rate;
+	int bfq_max_budget;
+
+	struct list_head active_list;
+	struct list_head idle_list;
+
+	unsigned int bfq_fifo_expire[2];
+	unsigned int bfq_back_penalty;
+	unsigned int bfq_back_max;
+	unsigned int bfq_slice_idle;
+	u64 bfq_class_idle_last_service;
+
+	int bfq_user_max_budget;
+	int bfq_max_budget_async_rq;
+	unsigned int bfq_timeout[2];
+
+	unsigned int bfq_coop_thresh;
+	unsigned int bfq_failed_cooperations;
+	unsigned int bfq_requests_within_timer;
+
+	unsigned long last_ins_in_burst;
+	unsigned long bfq_burst_interval;
+	int burst_size;
+	unsigned long bfq_large_burst_thresh;
+	bool large_burst;
+	struct hlist_head burst_list;
+
+	bool low_latency;
+
+	/* parameters of the low_latency heuristics */
+	unsigned int bfq_wr_coeff;
+	unsigned int bfq_wr_max_time;
+	unsigned int bfq_wr_rt_max_time;
+	unsigned int bfq_wr_min_idle_time;
+	unsigned long bfq_wr_min_inter_arr_async;
+	unsigned int bfq_wr_max_softrt_rate;
+	u64 RT_prod;
+	enum bfq_device_speed device_speed;
+
+	struct bfq_queue oom_bfqq;
+};
+
+enum bfqq_state_flags {
+	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */
+	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
+	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
+	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
+	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */
+	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
+	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */
+	BFQ_BFQQ_FLAG_IO_bound,		/*
+					 * bfqq has timed-out at least once
+					 * having consumed at most 2/10 of
+					 * its budget
+					 */
+	BFQ_BFQQ_FLAG_in_large_burst,	/*
+					 * bfqq activated in a large burst,
+					 * see comments to bfq_handle_burst.
+					 */
+	BFQ_BFQQ_FLAG_constantly_seeky,	/*
+					 * bfqq has proved to be slow and
+					 * seeky until budget timeout
+					 */
+	BFQ_BFQQ_FLAG_softrt_update,	/*
+					 * may need softrt-next-start
+					 * update
+					 */
+	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */
+	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
+	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */
+};
+
+#define BFQ_BFQQ_FNS(name)						\
+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
+{									\
+	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
+{									\
+	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
+{									\
+	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
+}
+
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(must_alloc);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(budget_new);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(constantly_seeky);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(just_split);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
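+
+/*
+ * For illustration, BFQ_BFQQ_FNS(busy) above expands to three helpers:
+ *
+ *	bfq_mark_bfqq_busy(bfqq);	sets BFQ_BFQQ_FLAG_busy in bfqq->flags
+ *	bfq_clear_bfqq_busy(bfqq);	clears it
+ *	bfq_bfqq_busy(bfqq);		tests it (returns 0 or 1)
+ *
+ * and likewise for every other flag passed to BFQ_BFQQ_FNS() above.
+ */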
+
+/* Logging facilities. */
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
+
+#define bfq_log(bfqd, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+	BFQ_BFQQ_TOO_IDLE = 0,		/*
+					 * queue has been idling for
+					 * too long
+					 */
+	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */
+	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */
+	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+struct bfqg_stats {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+	/* time not charged to this cgroup */
+	struct blkg_stat		unaccounted_time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+};
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @pd: the blkcg_policy_data this structure embeds (must be the first member)
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned short weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_must_not_expire()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
+	struct bfq_entity entity;
+	struct bfq_sched_data sched_data;
+
+	void *bfqd;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct bfq_entity *my_entity;
+
+	int active_entities;
+
+	struct rb_root rq_pos_tree;
+
+	struct bfqg_stats stats;
+	struct bfqg_stats dead_stats;	/* stats pushed from dead children */
+};
+
+#else
+struct bfq_group {
+	struct bfq_sched_data sched_data;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct rb_root rq_pos_tree;
+};
+#endif
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
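+/*
+ * Return the service tree that @entity is to be enqueued on: for a queue
+ * entity this is the tree matching its ioprio class (RT/BE/IDLE); for a
+ * group entity (bfq_entity_to_bfqq() returns NULL) it is the tree of the
+ * default group class.
+ */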
+static struct bfq_service_tree *
+bfq_entity_service_tree(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sched_data = entity->sched_data;
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
+				  BFQ_DEFAULT_GRP_CLASS;
+
+	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
+	BUG_ON(sched_data == NULL);
+
+	return sched_data->service_tree + idx;
+}
+
+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+	return bic->bfqq[is_sync];
+}
+
+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
+			 bool is_sync)
+{
+	bic->bfqq[is_sync] = bfqq;
+}
+
+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+	return bic->icq.q->elevator->elevator_data;
+}
+
+/**
+ * bfq_get_bfqd_locked - lock and return a bfqd through an RCU-protected pointer.
+ * @ptr: a pointer to a bfqd.
+ * @flags: storage for the flags to be saved.
+ *
+ * This function allows bfqg->bfqd to be protected by the
+ * queue lock of the bfqd it references; the pointer is dereferenced
+ * under RCU, so the storage for bfqd is assured to be safe as long
+ * as the RCU read side critical section does not end.  After the
+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
+ * sure that no other writer accessed it.  If we raced with a writer,
+ * the function returns NULL, with the queue unlocked, otherwise it
+ * returns the dereferenced pointer, with the queue locked.
+ */
+static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)
+{
+	struct bfq_data *bfqd;
+
+	rcu_read_lock();
+	bfqd = rcu_dereference(*(struct bfq_data **)ptr);
+
+	if (bfqd != NULL) {
+		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
+		if (ptr == NULL)
+			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");
+		else if (*ptr == bfqd)
+			goto out;
+		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+	}
+
+	bfqd = NULL;
+out:
+	rcu_read_unlock();
+	return bfqd;
+}
+
+static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)
+{
+	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+}
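+
+/*
+ * A hypothetical caller of the two helpers above (only an illustration,
+ * using the bfqg->bfqd field documented in the comment further up):
+ *
+ *	unsigned long flags;
+ *	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
+ *
+ *	if (bfqd) {
+ *		... work on bfqd under bfqd->queue->queue_lock ...
+ *		bfq_put_bfqd_unlock(bfqd, &flags);
+ *	}
+ */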
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	if (!group_entity)
+		group_entity = &bfqq->bfqd->root_group->entity;
+
+	return container_of(group_entity, struct bfq_group, entity);
+}
+
+#else
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+#endif
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
+static void bfq_put_queue(struct bfq_queue *bfqq);
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bio *bio, int is_sync,
+				       struct bfq_io_cq *bic, gfp_t gfp_mask);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg);
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+#endif /* _BFQ_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 45f4d7e..f98581f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -48,6 +48,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
 
+int trap_non_toi_io;
+
 /*
  * For the allocated request tables
  */
@@ -2104,6 +2106,9 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
 {
 	bio->bi_rw |= rw;
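+	/*
+	 * Debugging aid for the TuxOnIce changes in this patch: while
+	 * trap_non_toi_io is non-zero, any bio that is not flagged BIO_TOI
+	 * (i.e. not issued by the hibernation code) is treated as a bug.
+	 */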
 
+	if (unlikely(trap_non_toi_io))
+		BUG_ON(!bio_flagged(bio, BIO_TOI));
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
diff --git a/block/genhd.c b/block/genhd.c
index 9f42526..7a6a655 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,8 @@
 #include <linux/kobj_map.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
+#include <linux/ctype.h>
+#include <linux/fs_uuid.h>
 #include <linux/log2.h>
 #include <linux/pm_runtime.h>
 #include <linux/badblocks.h>
@@ -1417,6 +1419,85 @@ int invalidate_partition(struct gendisk *disk, int partno)
 
 EXPORT_SYMBOL(invalidate_partition);
 
+dev_t blk_lookup_fs_info(struct fs_info *seek)
+{
+	dev_t devt = MKDEV(0, 0);
+	struct class_dev_iter iter;
+	struct device *dev;
+	int best_score = 0;
+
+	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+	while (best_score < 3 && (dev = class_dev_iter_next(&iter))) {
+		struct gendisk *disk = dev_to_disk(dev);
+		struct disk_part_iter piter;
+		struct hd_struct *part;
+
+		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+
+		while (best_score < 3 && (part = disk_part_iter_next(&piter))) {
+			int score = part_matches_fs_info(part, seek);
+			if (score > best_score) {
+				devt = part_devt(part);
+				best_score = score;
+			}
+		}
+		disk_part_iter_exit(&piter);
+	}
+	class_dev_iter_exit(&iter);
+	return devt;
+}
+
+/* The caller passes NULL as @last to start. For each match found, we return
+ * a bdev on which we have done blkdev_get, and we do the blkdev_put on block
+ * devices that are passed to us. When no more matches are found, we return
+ * NULL. */
+struct block_device *next_bdev_of_type(struct block_device *last,
+	const char *key)
+{
+	dev_t devt = MKDEV(0, 0);
+	struct class_dev_iter iter;
+	struct device *dev;
+	struct block_device *next = NULL, *bdev;
+	int got_last = 0;
+
+	if (!key)
+		goto out;
+
+	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+	while (!devt && (dev = class_dev_iter_next(&iter))) {
+		struct gendisk *disk = dev_to_disk(dev);
+		struct disk_part_iter piter;
+		struct hd_struct *part;
+
+		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+
+		while ((part = disk_part_iter_next(&piter))) {
+			bdev = bdget(part_devt(part));
+			if (last && !got_last) {
+				if (last == bdev)
+					got_last = 1;
+				continue;
+			}
+
+			if (blkdev_get(bdev, FMODE_READ, 0))
+				continue;
+
+			if (bdev_matches_key(bdev, key)) {
+				next = bdev;
+				break;
+			}
+
+			blkdev_put(bdev, FMODE_READ);
+		}
+		disk_part_iter_exit(&piter);
+	}
+	class_dev_iter_exit(&iter);
+out:
+	if (last)
+		blkdev_put(last, FMODE_READ);
+	return next;
+}
+
 /*
  * Disk events - monitor disk events like media change and eject request.
  */
diff --git b/block/uuid.c b/block/uuid.c
new file mode 100644
index 0000000..c2416b9
--- /dev/null
+++ b/block/uuid.c
@@ -0,0 +1,509 @@
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/fs_uuid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+static int debug_enabled;
+
+#define PRINTK(fmt, args...) do {					\
+	if (debug_enabled)						\
+		printk(KERN_DEBUG fmt, ## args);			\
+	} while(0)
+
+#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8)			\
+	do {								\
+		if (debug_enabled)					\
+			print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8);	\
+	} while(0)
+
+/*
+ * Simple UUID translation
+ */
+
+struct uuid_info {
+	const char *key;
+	const char *name;
+	long bkoff;
+	unsigned sboff;
+	unsigned sig_len;
+	const char *magic;
+	int uuid_offset;
+	int last_mount_offset;
+	int last_mount_size;
+};
+
+/*
+ * Based on libuuid's blkid_magic array. Note that I don't
+ * have uuid offsets for all of these yet - missing ones are 0x0.
+ * Further information welcome.
+ *
+ * Rearranged by page of fs signature for optimisation.
+ */
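+
+/*
+ * How to read an entry, using the "ext" line below as a worked example:
+ * the sig_len-byte magic "\123\357" (0xEF53, little endian) is expected
+ * at byte offset (bkoff << 10) + sboff = 1024 + 0x38 of the device, the
+ * 16-byte UUID at absolute offset 0x468, and the 4-byte last-mount
+ * timestamp at absolute offset 0x42c.
+ */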
+static struct uuid_info uuid_list[] = {
+ { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 },
+ { "ntfs", "ntfs", 0, 3, 8, "NTFS    ", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x52, 8, "FAT32   ", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x36, 8, "FAT16   ", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x36, 8, "FAT12   ", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 },
+ { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 },
+ { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 },
+ { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 },
+ { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 },
+ { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 },
+ { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 },
+ { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 },
+ { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 },
+ { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 },
+ { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 },
+ { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 },
+ { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 },
+ { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 },
+ { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 },
+ { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 },
+ { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 },
+ { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 },
+ { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 },
+ { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 },
+ { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 },
+ { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 },
+ { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 },
+ { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 },
+ { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 },
+ { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 },
+ { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 },
+ { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 },
+ { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
+ { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 },
+ { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 },
+ { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 },
+ { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 },
+ { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 },
+ { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 },
+ { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 },
+ { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 },
+ { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
+ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 },
+ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 },
+ { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 },
+ { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 },
+ { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 },
+ { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 },
+ { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 },
+ { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 },
+ { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 },
+ { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 },
+ { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 },
+ { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 },
+ { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
+ { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
+ { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
+ { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 },
+ { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 },
+ { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 }
+};
+
+static int null_uuid(const char *uuid)
+{
+	int i;
+
+	for (i = 0; i < 16 && !uuid[i]; i++);
+
+	return (i == 16);
+}
+
+
+static void uuid_end_bio(struct bio *bio)
+{
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (bio->bi_error)
+		SetPageError(page);
+
+	unlock_page(page);
+	bio_put(bio);
+}
+
+
+/**
+ * read_bdev_page - Read a page from a device.
+ * @dev: The block device we're using.
+ * @page_num: The page we're reading.
+ *
+ * Based on Patrick Mochel's pmdisk code from long ago: "Straight from the
+ * textbook - allocate and initialize the bio. If we're writing, make sure
+ * the page is marked as dirty. Then submit it and carry on."
+ **/
+static struct page *read_bdev_page(struct block_device *dev, int page_num)
+{
+	struct bio *bio = NULL;
+	struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+
+	if (!page) {
+		printk(KERN_ERR "Failed to allocate a page for reading data "
+				"in UUID checks.\n");
+		return NULL;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_bdev = dev;
+	bio->bi_iter.bi_sector = page_num << 3;
+	bio->bi_end_io = uuid_end_bio;
+	bio->bi_flags |= (1 << BIO_TOI);
+
+	PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n",
+			(unsigned long) dev->bd_dev, page_num, bio, page);
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk(KERN_DEBUG "ERROR: adding page to bio at %d\n",
+				page_num);
+		bio_put(bio);
+		__free_page(page);
+		printk(KERN_DEBUG "read_bdev_page freed page %p (in error "
+				"path).\n", page);
+		return NULL;
+	}
+
+	lock_page(page);
+	submit_bio(READ | REQ_SYNC, bio);
+
+	wait_on_page_locked(page);
+	if (PageError(page)) {
+		__free_page(page);
+		page = NULL;
+	}
+	return page;
+}
+
+int bdev_matches_key(struct block_device *bdev, const char *key)
+{
+	unsigned char *data = NULL;
+	struct page *data_page = NULL;
+
+	int dev_offset, pg_num, pg_off, i;
+	int last_pg_num = -1;
+	int result = 0;
+	char buf[50];
+
+	if (null_uuid(key)) {
+		PRINTK("Refusing to find a NULL key.\n");
+		return 0;
+	}
+
+	if (!bdev->bd_disk) {
+		bdevname(bdev, buf);
+		PRINTK("bdev %s has no bd_disk.\n", buf);
+		return 0;
+	}
+
+	if (!bdev->bd_disk->queue) {
+		bdevname(bdev, buf);
+		PRINTK("bdev %s has no queue.\n", buf);
+		return 0;
+	}
+
+	for (i = 0; uuid_list[i].name; i++) {
+		struct uuid_info *dat = &uuid_list[i];
+
+		if (!dat->key || strcmp(dat->key, key))
+			continue;
+
+		dev_offset = (dat->bkoff << 10) + dat->sboff;
+		pg_num = dev_offset >> 12;
+		pg_off = dev_offset & 0xfff;
+
+		if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
+			continue;
+
+		if (pg_num != last_pg_num) {
+			if (data_page) {
+				kunmap(data_page);
+				__free_page(data_page);
+			}
+			data_page = read_bdev_page(bdev, pg_num);
+			if (!data_page)
+				continue;
+			data = kmap(data_page);
+		}
+
+		last_pg_num = pg_num;
+
+		if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
+			continue;
+
+		result = 1;
+		break;
+	}
+
+	if (data_page) {
+		kunmap(data_page);
+		__free_page(data_page);
+	}
+
+	return result;
+}
+
+/*
+ * part_matches_fs_info - Does the given partition match the details we seek?
+ *
+ * Returns a score saying how good the match is.
+ * 0 = no UUID match.
+ * 1 = UUID matches, but the last mount time differs.
+ * 2 = UUID and last mount time match, but the dev_t differs.
+ * 3 = perfect match.
+ *
+ * This lets us cope elegantly with probing resulting in dev_ts changing
+ * from boot to boot, and with the case where a user copies a partition
+ * (UUIDs are not necessarily unique), and we need to check the last mount
+ * time of the correct partition.
+ */
+int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek)
+{
+	struct block_device *bdev;
+	struct fs_info *got;
+	int result = 0;
+	char buf[50];
+
+	if (null_uuid((char *) &seek->uuid)) {
+		PRINTK("Refusing to find a NULL uuid.\n");
+		return 0;
+	}
+
+	bdev = bdget(part_devt(part));
+
+	PRINTK("part_matches fs info considering %x.\n", part_devt(part));
+
+	if (blkdev_get(bdev, FMODE_READ, 0)) {
+		PRINTK("blkdev_get failed.\n");
+		return 0;
+	}
+
+	if (!bdev->bd_disk) {
+		bdevname(bdev, buf);
+		PRINTK("bdev %s has no bd_disk.\n", buf);
+		goto out;
+	}
+
+	if (!bdev->bd_disk->queue) {
+		bdevname(bdev, buf);
+		PRINTK("bdev %s has no queue.\n", buf);
+		goto out;
+	}
+
+	got = fs_info_from_block_dev(bdev);
+
+	if (got && !memcmp(got->uuid, seek->uuid, 16)) {
+		PRINTK(" Have matching UUID.\n");
+		PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount);
+		PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount);
+		result = 1;
+
+		if (got->last_mount_size == seek->last_mount_size &&
+		    got->last_mount && seek->last_mount &&
+		    !memcmp(got->last_mount, seek->last_mount,
+			    got->last_mount_size)) {
+			result = 2;
+
+			PRINTK(" Matching last mount time.\n");
+
+			if (part_devt(part) == seek->dev_t) {
+				result = 3;
+				PRINTK(" Matching dev_t.\n");
+			} else
+				PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t);
+		}
+	}
+
+	PRINTK(" Score for %x is %d.\n", part_devt(part), result);
+	free_fs_info(got);
+out:
+	blkdev_put(bdev, FMODE_READ);
+	return result;
+}
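+
+/*
+ * The score returned above is consumed by blk_lookup_fs_info() in
+ * block/genhd.c, which walks all partitions, remembers the dev_t of the
+ * highest-scoring one and stops early on a perfect match (score 3).
+ */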
+
+void free_fs_info(struct fs_info *fs_info)
+{
+	if (!fs_info || IS_ERR(fs_info))
+		return;
+
+	if (fs_info->last_mount)
+		kfree(fs_info->last_mount);
+
+	kfree(fs_info);
+}
+
+struct fs_info *fs_info_from_block_dev(struct block_device *bdev)
+{
+	unsigned char *data = NULL;
+	struct page *data_page = NULL;
+
+	int dev_offset, pg_num, pg_off;
+	int uuid_pg_num, uuid_pg_off, i;
+	unsigned char *uuid_data = NULL;
+	struct page *uuid_data_page = NULL;
+
+	int last_pg_num = -1, last_uuid_pg_num = 0;
+	char buf[50];
+	struct fs_info *fs_info = NULL;
+
+	bdevname(bdev, buf);
+
+	PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf);
+
+	for (i = 0; uuid_list[i].name; i++) {
+		struct uuid_info *dat = &uuid_list[i];
+		dev_offset = (dat->bkoff << 10) + dat->sboff;
+		pg_num = dev_offset >> 12;
+		pg_off = dev_offset & 0xfff;
+		uuid_pg_num = dat->uuid_offset >> 12;
+		uuid_pg_off = dat->uuid_offset & 0xfff;
+
+		if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
+			continue;
+
+		/* Ignore partition types with no UUID offset */
+		if (!dat->uuid_offset)
+			continue;
+
+		if (pg_num != last_pg_num) {
+			if (data_page) {
+				kunmap(data_page);
+				__free_page(data_page);
+			}
+			data_page = read_bdev_page(bdev, pg_num);
+			if (!data_page)
+				continue;
+			data = kmap(data_page);
+		}
+
+		last_pg_num = pg_num;
+
+		if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
+			continue;
+
+		PRINTK("This partition looks like %s.\n", dat->name);
+
+		fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL);
+
+		if (!fs_info) {
+			PRINTK("Failed to allocate fs_info struct.\n");
+			fs_info = ERR_PTR(-ENOMEM);
+			break;
+		}
+
+		/* UUID can't be off the end of the disk */
+		if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) ||
+				!dat->uuid_offset)
+			goto no_uuid;
+
+		if (!uuid_data || uuid_pg_num != last_uuid_pg_num) {
+			/* No need to reread the page from above */
+			if (uuid_pg_num == pg_num && uuid_data)
+				memcpy(uuid_data, data, PAGE_SIZE);
+			else {
+				if (uuid_data_page) {
+					kunmap(uuid_data_page);
+					__free_page(uuid_data_page);
+				}
+				uuid_data_page = read_bdev_page(bdev, uuid_pg_num);
+				if (!uuid_data_page)
+					continue;
+				uuid_data = kmap(uuid_data_page);
+			}
+		}
+
+		last_uuid_pg_num = uuid_pg_num;
+		memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16);
+		fs_info->dev_t = bdev->bd_dev;
+
+no_uuid:
+		PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev "
+				"returning uuid ", DUMP_PREFIX_NONE, 16, 1,
+				fs_info->uuid, 16, 0);
+
+		if (dat->last_mount_size) {
+			int pg = dat->last_mount_offset >> 12, sz;
+			int off = dat->last_mount_offset & 0xfff;
+			struct page *last_mount = read_bdev_page(bdev, pg);
+			unsigned char *last_mount_data;
+			char *ptr;
+
+			if (!last_mount) {
+				fs_info = ERR_PTR(-ENOMEM);
+				break;
+			}
+			last_mount_data = kmap(last_mount);
+			sz = dat->last_mount_size;
+			ptr = kmalloc(sz, GFP_KERNEL);
+
+			if (!ptr) {
+				printk(KERN_EMERG "fs_info_from_block_dev "
+					"failed to get memory for last mount "
+					"timestamp.\n");
+				free_fs_info(fs_info);
+				fs_info = ERR_PTR(-ENOMEM);
+			} else {
+				fs_info->last_mount = ptr;
+				fs_info->last_mount_size = sz;
+				memcpy(ptr, &last_mount_data[off], sz);
+			}
+
+			kunmap(last_mount);
+			__free_page(last_mount);
+		}
+		break;
+	}
+
+	if (data_page) {
+		kunmap(data_page);
+		__free_page(data_page);
+	}
+
+	if (uuid_data_page) {
+		kunmap(uuid_data_page);
+		__free_page(uuid_data_page);
+	}
+
+	return fs_info;
+}
+
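+/*
+ * The PRINTK/PRINT_HEX_DUMP output above is silent by default; booting with
+ * "uuid_debug=1" on the kernel command line enables it via the __setup
+ * handler below.
+ */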
+static int __init uuid_debug_setup(char *str)
+{
+	int value;
+
+	if (sscanf(str, "=%d", &value))
+		debug_enabled = value;
+
+	return 1;
+}
+
+__setup("uuid_debug", uuid_debug_setup);
diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c
index dc34a5b..020118b 100644
--- a/drivers/accessibility/braille/braille_console.c
+++ b/drivers/accessibility/braille/braille_console.c
@@ -116,7 +116,7 @@ static void braille_write(u16 *buf)
 	*c++ = csum;
 	*c++ = ETX;
 
-	braille_co->write(braille_co, data, c - data);
+	braille_co->write(braille_co, data, c - data, 0);
 }
 
 /* Follow the VC cursor*/
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 55e257c..474f9f4 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2096,7 +2096,11 @@ static int ata_dev_config_ncq(struct ata_device *dev,
 		return 0;
 	}
 	if (ap->flags & ATA_FLAG_NCQ) {
+#ifdef CONFIG_PCK_INTERACTIVE
+		hdepth = min(ap->scsi_host->can_queue, 8);
+#else
 		hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE - 1);
+#endif
 		dev->flags |= ATA_DFLAG_NCQ;
 	}
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 80cf8ad..ba9e4a7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -712,6 +712,24 @@ static inline int is_loop_device(struct file *file)
 	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
 }
 
+/*
+ * for AUFS
+ * no get/put for file.
+ */
+struct file *loop_backing_file(struct super_block *sb)
+{
+	struct file *ret;
+	struct loop_device *l;
+
+	ret = NULL;
+	if (MAJOR(sb->s_dev) == LOOP_MAJOR) {
+		l = sb->s_bdev->bd_disk->private_data;
+		ret = l->lo_backing_file;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(loop_backing_file);
+
 /* loop sysfs attributes */
 
 static ssize_t loop_attr_show(struct device *dev, char *page,
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e979ec7..da8b7b9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -25,6 +25,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
@@ -1900,6 +1901,12 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 	}
 
 out:
+	if (likely(retval != -EINVAL)) {
+		if (target_freq == policy->max)
+			cpu_nonscaling(policy->cpu);
+		else
+			cpu_scaling(policy->cpu);
+	}
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 606ad74..ce8a7e1 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -15,8 +15,14 @@
 #include "cpufreq_governor.h"
 
 /* Conservative governor macros */
+#ifdef CONFIG_SCHED_BFS
+#define DEF_FREQUENCY_UP_THRESHOLD		(63)
+#define DEF_FREQUENCY_DOWN_THRESHOLD		(26)
+#else
 #define DEF_FREQUENCY_UP_THRESHOLD		(80)
 #define DEF_FREQUENCY_DOWN_THRESHOLD		(20)
+#endif
+
 #define DEF_FREQUENCY_STEP			(5)
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(10)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index eae5107..38fbd87 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -19,8 +19,17 @@
 #include "cpufreq_governor.h"
 
 /* On-demand governor macros */
+#ifdef CONFIG_SCHED_BFS
+#define DEF_FREQUENCY_UP_THRESHOLD		(63)
+#else
 #define DEF_FREQUENCY_UP_THRESHOLD		(80)
+#endif
+
+#ifdef CONFIG_PCK_INTERACTIVE
+#define DEF_SAMPLING_DOWN_FACTOR		(10)
+#else
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
+#endif
 #define MAX_SAMPLING_DOWN_FACTOR		(100000)
 #define MICRO_FREQUENCY_UP_THRESHOLD		(95)
 #define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e895123..3b29308 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -575,8 +575,13 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate)
 	vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
 	vid = ceiling_fp(vid_fp);
 
-	if (pstate > cpudata->pstate.max_pstate)
-		vid = cpudata->vid.turbo;
+	if (pstate < cpudata->pstate.max_pstate)
+		cpu_scaling(cpudata->cpu);
+	else {
+		if (pstate > cpudata->pstate.max_pstate)
+			vid = cpudata->vid.turbo;
+		cpu_nonscaling(cpudata->cpu);
+	}
 
 	val |= vid;
 
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 2e8c77e..8d75300 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -137,7 +137,7 @@ int drm_gem_object_init(struct drm_device *dev,
 
 	drm_gem_private_object_init(dev, obj, size);
 
-	filp = shmem_file_setup("drm mm object", size, VM_NORESERVE);
+	filp = shmem_file_setup("drm mm object", size, VM_NORESERVE, 1);
 	if (IS_ERR(filp))
 		return PTR_ERR(filp);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2e832fa..da9a471 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -761,12 +761,12 @@ static int i915_drm_resume(struct drm_device *dev)
 		dev_priv->display.hpd_irq_setup(dev);
 	spin_unlock_irq(&dev_priv->irq_lock);
 
+	intel_dp_mst_resume(dev);
+
 	drm_modeset_lock_all(dev);
 	intel_display_resume(dev);
 	drm_modeset_unlock_all(dev);
 
-	intel_dp_mst_resume(dev);
-
 	/*
 	 * ... but also need to make sure that hotplug processing
 	 * doesn't cause havoc. Like in the driver load code we don't
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 3cd4996..60ee237 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -6113,6 +6113,19 @@ void intel_dp_mst_resume(struct drm_device *dev)
 
 			ret = drm_dp_mst_topology_mgr_resume(&intel_dig_port->dp.mst_mgr);
 			if (ret != 0) {
+				/*
+				 * For some reason, some laptops can't bring
+				 * their MST docks back up immediately after
+				 * resume and need to wait a short period of
+				 * time before aux transactions with the dock
+				 * become functional again. Until a proper
+				 * fix is found, this workaround should
+				 * suffice.
+				 */
+				msleep(30);
+				ret = drm_dp_mst_topology_mgr_resume(&intel_dig_port->dp.mst_mgr);
+			}
+			if (ret != 0) {
 				intel_dp_check_mst_status(&intel_dig_port->dp);
 			}
 		}
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index ca4c01f..2062228 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2489,7 +2489,7 @@ int r600_init_microcode(struct radeon_device *rdev)
 	}
 
 	DRM_INFO("Loading %s Microcode\n", chip_name);
-
+#if 0
 	snprintf(fw_name, sizeof(fw_name), "/*(DEBLOBBED)*/", chip_name);
 	err = reject_firmware(&rdev->pfp_fw, fw_name, rdev->dev);
 	if (err)
@@ -2541,7 +2541,7 @@ int r600_init_microcode(struct radeon_device *rdev)
 			err = -EINVAL;
 		}
 	}
-
+#endif
 out:
 	if (err) {
 		if (err != -EINVAL)
@@ -3201,7 +3201,7 @@ int r600_init(struct radeon_device *rdev)
 	r = radeon_bo_init(rdev);
 	if (r)
 		return r;
-
+#if 0
 	if (!rdev->me_fw || !rdev->pfp_fw || !rdev->rlc_fw) {
 		r = r600_init_microcode(rdev);
 		if (r) {
@@ -3209,7 +3209,7 @@ int r600_init(struct radeon_device *rdev)
 			return r;
 		}
 	}
-
+#endif
 	/* Initialize power management */
 	radeon_pm_init(rdev);
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 4e19d0f..aaaf683 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -339,7 +339,7 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 	if (!persistent_swap_storage) {
 		swap_storage = shmem_file_setup("ttm swap",
 						ttm->num_pages << PAGE_SHIFT,
-						0);
+						0, 0);
 		if (IS_ERR(swap_storage)) {
 			pr_err("Failed allocating swap storage\n");
 			return PTR_ERR(swap_storage);
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 5d11fea..e819a60 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -28,15 +28,21 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION("Joystick device interfaces");
 MODULE_SUPPORTED_DEVICE("input/js");
 MODULE_LICENSE("GPL");
-
 #define JOYDEV_MINOR_BASE	0
 #define JOYDEV_MINORS		16
 #define JOYDEV_BUFFER_SIZE	64
+#define MAX_REMAP_SIZE		10
+
+static int remap_array[MAX_REMAP_SIZE];
+static int remap_count;
+static int free_buttons[MAX_REMAP_SIZE];
 
 struct joydev {
 	int open;
@@ -71,6 +77,9 @@ struct joydev_client {
 	struct list_head node;
 };
 
+module_param_array(remap_array, int, &remap_count, 0);
+MODULE_PARM_DESC(remap_array, "remap axes to buttons");
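+/*
+ * Example (values are purely illustrative): booting with
+ * joydev.remap_array=2,5 -- or loading the module with remap_array=2,5 --
+ * should cause events with codes 2 and 5 to be reported as spare buttons
+ * instead; see the remapping block in joydev_event() below.
+ */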
+
 static int joydev_correct(int value, struct js_corr *corr)
 {
 	switch (corr->type) {
@@ -121,6 +130,17 @@ static void joydev_event(struct input_handle *handle,
 	struct joydev *joydev = handle->private;
 	struct joydev_client *client;
 	struct js_event event;
+	int i;
+
+	if (remap_count > 0 && remap_count < MAX_REMAP_SIZE) {
+		for (i = 0; i < remap_count; i++)
+			if (code == remap_array[i]) {
+				type = EV_KEY;
+				code = free_buttons[i];
+				if (value == 255)
+					value = 1;
+			}
+	}
 
 	switch (type) {
 
@@ -816,7 +836,7 @@ static int joydev_connect(struct input_handler *handler, struct input_dev *dev,
 			  const struct input_device_id *id)
 {
 	struct joydev *joydev;
-	int i, j, t, minor, dev_no;
+	int i, j = 0, t, minor, dev_no;
 	int error;
 
 	minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true);
@@ -860,15 +880,24 @@ static int joydev_connect(struct input_handler *handler, struct input_dev *dev,
 			joydev->keymap[i] = joydev->nkey;
 			joydev->keypam[joydev->nkey] = i + BTN_MISC;
 			joydev->nkey++;
-		}
+			j = i;
+		}
+	if (remap_count > 0 && remap_count < MAX_REMAP_SIZE) {
+		printk("[joydev] axis remapping enabled\n");
+		for (i = 0; i < remap_count; i++) {
+			joydev->keymap[j + i + 1] = joydev->nkey;
+			joydev->keypam[joydev->nkey] = i + j + 1 + BTN_MISC;
+			free_buttons[i] = j + i + 1 + BTN_MISC;
+			joydev->nkey++;
 
+		}
 	for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++)
 		if (test_bit(i + BTN_MISC, dev->keybit)) {
 			joydev->keymap[i] = joydev->nkey;
 			joydev->keypam[joydev->nkey] = i + BTN_MISC;
 			joydev->nkey++;
 		}
-
+	}
 	for (i = 0; i < joydev->nabs; i++) {
 		j = joydev->abspam[i];
 		if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) {
diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c
index a41d832..a5c2eb2 100644
--- a/drivers/input/mouse/synaptics.c
+++ b/drivers/input/mouse/synaptics.c
@@ -1251,7 +1251,9 @@ static void set_input_params(struct psmouse *psmouse,
 		/* Clickpads report only left button */
 		__clear_bit(BTN_RIGHT, dev->keybit);
 		__clear_bit(BTN_MIDDLE, dev->keybit);
-	}
+	} else if (SYN_CAP_CLICKPAD2BTN(priv->ext_cap_0c) ||
+		   SYN_CAP_CLICKPAD2BTN2(priv->ext_cap_0c))
+		__set_bit(INPUT_PROP_BUTTONPAD, dev->propbit);
 }
 
 static ssize_t synaptics_show_disable_gesture(struct psmouse *psmouse,
diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h
index 56faa7e..c20f32c 100644
--- a/drivers/input/mouse/synaptics.h
+++ b/drivers/input/mouse/synaptics.h
@@ -85,6 +85,7 @@
  */
 #define SYN_CAP_CLICKPAD(ex0c)		((ex0c) & 0x100000) /* 1-button ClickPad */
 #define SYN_CAP_CLICKPAD2BTN(ex0c)	((ex0c) & 0x000100) /* 2-button ClickPad */
+#define SYN_CAP_CLICKPAD2BTN2(ex0c)	((ex0c) & 0x200000) /* 2-button ClickPad */
 #define SYN_CAP_MAX_DIMENSIONS(ex0c)	((ex0c) & 0x020000)
 #define SYN_CAP_MIN_DIMENSIONS(ex0c)	((ex0c) & 0x002000)
 #define SYN_CAP_ADV_GESTURE(ex0c)	((ex0c) & 0x080000)
diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c
index ce2fc48..e0c714f 100644
--- a/drivers/input/touchscreen/atmel_mxt_ts.c
+++ b/drivers/input/touchscreen/atmel_mxt_ts.c
@@ -1973,7 +1973,7 @@ static int mxt_initialize(struct mxt_data *data)
 	if (error)
 		goto err_free_object_table;
 
-	error = reject_firmware_nowait(THIS_MODULE, true, MXT_CFG_NAME,
+	error = request_firmware_nowait(THIS_MODULE, true, MXT_CFG_NAME,
 					&client->dev, GFP_KERNEL, data,
 					mxt_config_cb);
 	if (error) {
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 3e8b29e..c043885 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -171,6 +171,13 @@ config INPUT_ADBHID
 
 	  If unsure, say Y.
 
+config ADB_TRACKPAD_ABSOLUTE
+	bool "Enable absolute mode for adb trackpads"
+	depends on INPUT_ADBHID
+	help
+	  Enable absolute mode on ADB-based trackpads. This feature adds
+	  compatibility with the Synaptics X.Org/XFree86 drivers.
+
 config MAC_EMUMOUSEBTN
 	tristate "Support for mouse button 2+3 emulation"
 	depends on SYSCTL && INPUT
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index 09d72bb..8d23b27 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -261,6 +261,15 @@ static struct adb_ids buttons_ids;
 #define ADBMOUSE_MS_A3		8	/* Mouse systems A3 trackball (handler 3) */
 #define ADBMOUSE_MACALLY2	9	/* MacAlly 2-button mouse */
 
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+#define	ABS_XMIN	310
+#define	ABS_XMAX	1700
+#define	ABS_YMIN	200
+#define	ABS_YMAX	1000
+#define	ABS_ZMIN	0
+#define	ABS_ZMAX	55
+#endif
+
 static void
 adbhid_keyboard_input(unsigned char *data, int nb, int apoll)
 {
@@ -405,6 +414,9 @@ static void
 adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 {
 	int id = (data[0] >> 4) & 0x0f;
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	int btn = 0, x_axis = 0, y_axis = 0, z_axis = 0;
+#endif
 
 	if (!adbhid[id]) {
 		printk(KERN_ERR "ADB HID on ID %d not yet registered\n", id);
@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 	      high bits of y-axis motion.  XY is additional
 	      high bits of x-axis motion.
 
+    For ADB Absolute motion protocol the data array will contain the
+    following values:
+
+		BITS    COMMENTS
+    data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd.
+    data[1] = byyy yyyy Left button and y-axis motion.
+    data[2] = bxxx xxxx Second button and x-axis motion.
+    data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion.
+    data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion.
+    data[5] = 1zzz 1zzz Higher and lower bits of z-pressure.
+
     MacAlly 2-button mouse protocol.
 
     For MacAlly 2-button mouse protocol the data array will contain the
@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 	switch (adbhid[id]->mouse_kind)
 	{
 	    case ADBMOUSE_TRACKPAD:
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) |
+			((data[4] & 0x07) << 10);
+		y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) |
+			((data[4] & 0x70) << 6);
+		z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1);
+		btn = (!(data[1] >> 7)) & 1;
+#else
 		data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80);
 		data[2] = data[2] | 0x80;
+#endif
 		break;
 	    case ADBMOUSE_MICROSPEED:
 		data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7);
@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
                 break;
 	}
 
-	input_report_key(adbhid[id]->input, BTN_LEFT,   !((data[1] >> 7) & 1));
-	input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1));
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	if (adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD) {
 
-	if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD)
-		input_report_key(adbhid[id]->input, BTN_RIGHT,  !((data[3] >> 7) & 1));
+		if (z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1);
+		if (z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0);
 
-	input_report_rel(adbhid[id]->input, REL_X,
-			 ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 ));
-	input_report_rel(adbhid[id]->input, REL_Y,
-			 ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 ));
+		if (z_axis > 0) {
+			input_report_abs(adbhid[id]->input, ABS_X, x_axis);
+			input_report_abs(adbhid[id]->input, ABS_Y, y_axis);
+			input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1);
+			input_report_abs(adbhid[id]->input, ABS_TOOL_WIDTH, 5);
+		} else {
+			input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0);
+			input_report_abs(adbhid[id]->input, ABS_TOOL_WIDTH, 0);
+		}
+
+		input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis);
+		input_report_key(adbhid[id]->input, BTN_LEFT, btn);
+	} else {
+#endif
+		input_report_key(adbhid[id]->input, BTN_LEFT,   !((data[1] >> 7) & 1));
+		input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1));
+
+		if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD)
+			input_report_key(adbhid[id]->input, BTN_RIGHT,  !((data[3] >> 7) & 1));
 
+		input_report_rel(adbhid[id]->input, REL_X,
+				((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 ));
+		input_report_rel(adbhid[id]->input, REL_Y,
+				((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 ));
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	}
+#endif
 	input_sync(adbhid[id]->input);
 }
 
@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id,
 		input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) |
 			BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT);
 		input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		set_bit(EV_ABS, input_dev->evbit);
+		input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0);
+		input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0);
+		input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0);
+		set_bit(BTN_TOUCH, input_dev->keybit);
+		set_bit(BTN_TOOL_FINGER, input_dev->keybit);
+		set_bit(ABS_TOOL_WIDTH, input_dev->absbit);
+#endif
 		break;
 
 	case ADB_MISC:
@@ -1132,7 +1195,11 @@ init_trackpad(int id)
 	            r1_buffer[3],
 	            r1_buffer[4],
 	            r1_buffer[5],
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		    0x00, /* Enable absolute mode */
+#else
 	            0x03, /*r1_buffer[6],*/
+#endif
 	            r1_buffer[7]);
 
 	    /* Without this flush, the trackpad may be locked up */
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 06ee639..e8f7fbf 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -829,7 +829,7 @@ static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg,
 }
 
 static void write_ext_msg(struct console *con, const char *msg,
-			  unsigned int len)
+			  unsigned int len, unsigned int loglevel)
 {
 	struct netconsole_target *nt;
 	unsigned long flags;
@@ -844,7 +844,8 @@ static void write_ext_msg(struct console *con, const char *msg,
 	spin_unlock_irqrestore(&target_list_lock, flags);
 }
 
-static void write_msg(struct console *con, const char *msg, unsigned int len)
+static void write_msg(struct console *con, const char *msg, unsigned int len,
+                      unsigned int loglevel)
 {
 	int frag, left;
 	unsigned long flags;
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 69f93a5..06fa7a3 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -490,9 +490,30 @@ config THINKPAD_ACPI_HOTKEY_POLL
 	  If you are not sure, say Y here.  The driver enables polling only if
 	  it is strictly necessary to do so.
 
+config THINKPAD_EC
+	tristate
+	depends on X86
+	---help---
+	  This is a low-level driver for accessing the ThinkPad H8S embedded
+	  controller over the LPC bus (not to be confused with the ACPI Embedded
+	  Controller interface).
+
+config TP_SMAPI
+	tristate "ThinkPad SMAPI Support"
+	depends on X86
+	select THINKPAD_EC
+	default n
+	help
+	  This adds SMAPI support on Lenovo/IBM ThinkPads, for features such
+	  as battery charging control. For more information about this driver
+	  see <http://www.thinkwiki.org/wiki/tp_smapi>.
+
+	  If you have a Lenovo/IBM ThinkPad laptop, say Y or M here.
+
 config SENSORS_HDAPS
 	tristate "Thinkpad Hard Drive Active Protection System (hdaps)"
 	depends on INPUT && X86
+	select THINKPAD_EC
 	select INPUT_POLLDEV
 	default n
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 40574e7..926ab3e 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -26,6 +26,8 @@ obj-$(CONFIG_TC1100_WMI)	+= tc1100-wmi.o
 obj-$(CONFIG_SONY_LAPTOP)	+= sony-laptop.o
 obj-$(CONFIG_IDEAPAD_LAPTOP)	+= ideapad-laptop.o
 obj-$(CONFIG_THINKPAD_ACPI)	+= thinkpad_acpi.o
+obj-$(CONFIG_THINKPAD_EC)	+= thinkpad_ec.o
+obj-$(CONFIG_TP_SMAPI)		+= tp_smapi.o
 obj-$(CONFIG_SENSORS_HDAPS)	+= hdaps.o
 obj-$(CONFIG_FUJITSU_LAPTOP)	+= fujitsu-laptop.o
 obj-$(CONFIG_FUJITSU_TABLET)	+= fujitsu-tablet.o
diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c
index 458e6c9..1dae4c5 100644
--- a/drivers/platform/x86/hdaps.c
+++ b/drivers/platform/x86/hdaps.c
@@ -30,266 +30,384 @@
 
 #include <linux/delay.h>
 #include <linux/platform_device.h>
-#include <linux/input-polldev.h>
+#include <linux/input.h>
 #include <linux/kernel.h>
-#include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/dmi.h>
 #include <linux/jiffies.h>
-#include <linux/io.h>
-
-#define HDAPS_LOW_PORT		0x1600	/* first port used by hdaps */
-#define HDAPS_NR_PORTS		0x30	/* number of ports: 0x1600 - 0x162f */
-
-#define HDAPS_PORT_STATE	0x1611	/* device state */
-#define HDAPS_PORT_YPOS		0x1612	/* y-axis position */
-#define	HDAPS_PORT_XPOS		0x1614	/* x-axis position */
-#define HDAPS_PORT_TEMP1	0x1616	/* device temperature, in Celsius */
-#define HDAPS_PORT_YVAR		0x1617	/* y-axis variance (what is this?) */
-#define HDAPS_PORT_XVAR		0x1619	/* x-axis variance (what is this?) */
-#define HDAPS_PORT_TEMP2	0x161b	/* device temperature (again?) */
-#define HDAPS_PORT_UNKNOWN	0x161c	/* what is this? */
-#define HDAPS_PORT_KMACT	0x161d	/* keyboard or mouse activity */
-
-#define STATE_FRESH		0x50	/* accelerometer data is fresh */
+#include <linux/thinkpad_ec.h>
+#include <linux/pci_ids.h>
+#include <linux/version.h>
+
+/* Embedded controller accelerometer read command and its result: */
+static const struct thinkpad_ec_row ec_accel_args =
+	{ .mask = 0x0001, .val = {0x11} };
+#define EC_ACCEL_IDX_READOUTS	0x1	/* readouts included in this read */
+					/* First readout, if READOUTS>=1: */
+#define EC_ACCEL_IDX_YPOS1	0x2	/*   y-axis position word */
+#define EC_ACCEL_IDX_XPOS1	0x4	/*   x-axis position word */
+#define EC_ACCEL_IDX_TEMP1	0x6	/*   device temperature in Celsius */
+					/* Second readout, if READOUTS>=2: */
+#define EC_ACCEL_IDX_XPOS2	0x7	/*   y-axis position word */
+#define EC_ACCEL_IDX_YPOS2	0x9	/*   x-axis position word */
+#define EC_ACCEL_IDX_TEMP2	0xb	/*   device temperature in Celsius */
+#define EC_ACCEL_IDX_QUEUED	0xc	/* Number of queued readouts left */
+#define EC_ACCEL_IDX_KMACT	0xd	/* keyboard or mouse activity */
+#define EC_ACCEL_IDX_RETVAL	0xf	/* command return value, good=0x00 */
 
 #define KEYBD_MASK		0x20	/* set if keyboard activity */
 #define MOUSE_MASK		0x40	/* set if mouse activity */
-#define KEYBD_ISSET(n)		(!! (n & KEYBD_MASK))	/* keyboard used? */
-#define MOUSE_ISSET(n)		(!! (n & MOUSE_MASK))	/* mouse used? */
 
-#define INIT_TIMEOUT_MSECS	4000	/* wait up to 4s for device init ... */
-#define INIT_WAIT_MSECS		200	/* ... in 200ms increments */
+#define READ_TIMEOUT_MSECS	100	/* wait this long for device read */
+#define RETRY_MSECS		3	/* retry delay */
 
-#define HDAPS_POLL_INTERVAL	50	/* poll for input every 1/20s (50 ms)*/
 #define HDAPS_INPUT_FUZZ	4	/* input event threshold */
 #define HDAPS_INPUT_FLAT	4
-
-#define HDAPS_X_AXIS		(1 << 0)
-#define HDAPS_Y_AXIS		(1 << 1)
-#define HDAPS_BOTH_AXES		(HDAPS_X_AXIS | HDAPS_Y_AXIS)
-
+#define KMACT_REMEMBER_PERIOD   (HZ/10) /* keyboard/mouse persistence */
+
+/* Input IDs */
+#define HDAPS_INPUT_VENDOR	PCI_VENDOR_ID_IBM
+#define HDAPS_INPUT_PRODUCT	0x5054 /* "TP", shared with thinkpad_acpi */
+#define HDAPS_INPUT_JS_VERSION	0x6801 /* Joystick emulation input device */
+#define HDAPS_INPUT_RAW_VERSION	0x4801 /* Raw accelerometer input device */
+
+/* Axis orientation. */
+/* The unnatural bit-representation of inversions is for backward
+ * compatibility with the"invert=1" module parameter.             */
+#define HDAPS_ORIENT_INVERT_XY  0x01   /* Invert both X and Y axes.       */
+#define HDAPS_ORIENT_INVERT_X   0x02   /* Invert the X axis (uninvert if
+					* already inverted by INVERT_XY). */
+#define HDAPS_ORIENT_SWAP       0x04   /* Swap the axes. The swap occurs
+					* before inverting X or Y.        */
+#define HDAPS_ORIENT_MAX        0x07
+#define HDAPS_ORIENT_UNDEFINED  0xFF   /* Placeholder during initialization */
+#define HDAPS_ORIENT_INVERT_Y   (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X)
+
+static struct timer_list hdaps_timer;
 static struct platform_device *pdev;
-static struct input_polled_dev *hdaps_idev;
-static unsigned int hdaps_invert;
-static u8 km_activity;
-static int rest_x;
-static int rest_y;
-
-static DEFINE_MUTEX(hdaps_mtx);
-
-/*
- * __get_latch - Get the value from a given port.  Callers must hold hdaps_mtx.
- */
-static inline u8 __get_latch(u16 port)
+static struct input_dev *hdaps_idev;     /* joystick-like device with fuzz */
+static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */
+static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED;
+static int needs_calibration;
+
+/* Configuration: */
+static int sampling_rate = 50;       /* Sampling rate  */
+static int oversampling_ratio = 5;   /* Ratio between our sampling rate and
+				      * EC accelerometer sampling rate      */
+static int running_avg_filter_order = 2; /* EC running average filter order */
+
+/* Latest state readout: */
+static int pos_x, pos_y;      /* position */
+static int temperature;       /* temperature */
+static int stale_readout = 1; /* last read invalid */
+static int rest_x, rest_y;    /* calibrated rest position */
+
+/* Last time we saw keyboard and mouse activity: */
+static u64 last_keyboard_jiffies = INITIAL_JIFFIES;
+static u64 last_mouse_jiffies = INITIAL_JIFFIES;
+static u64 last_update_jiffies = INITIAL_JIFFIES;
+
+/* input device use count */
+static int hdaps_users;
+static DEFINE_MUTEX(hdaps_users_mtx);
+
+/* Some models require an axis transformation to the standard representation */
+static void transform_axes(int *x, int *y)
 {
-	return inb(port) & 0xff;
+	if (hdaps_invert & HDAPS_ORIENT_SWAP) {
+		int z;
+		z = *x;
+		*x = *y;
+		*y = z;
+	}
+	if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) {
+		*x = -*x;
+		*y = -*y;
+	}
+	if (hdaps_invert & HDAPS_ORIENT_INVERT_X)
+		*x = -*x;
 }
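+
+/*
+ * Illustrative example (not part of the driver logic): with orientation
+ * code HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X (0x06), the axes are
+ * first swapped and then X is inverted, so a raw readout of
+ * (x, y) = (100, 20) is reported as (-20, 100).
+ */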
 
-/*
- * __check_latch - Check a port latch for a given value.  Returns zero if the
- * port contains the given value.  Callers must hold hdaps_mtx.
+/**
+ * __hdaps_update - query current state, with locks already acquired
+ * @fast: if nonzero, do one quick attempt without retries.
+ *
+ * Query current accelerometer state and update global state variables.
+ * Also prefetches the next query. Caller must hold controller lock.
  */
-static inline int __check_latch(u16 port, u8 val)
+static int __hdaps_update(int fast)
 {
-	if (__get_latch(port) == val)
-		return 0;
-	return -EINVAL;
-}
+	/* Read data: */
+	struct thinkpad_ec_row data;
+	int ret;
 
-/*
- * __wait_latch - Wait up to 100us for a port latch to get a certain value,
- * returning zero if the value is obtained.  Callers must hold hdaps_mtx.
- */
-static int __wait_latch(u16 port, u8 val)
-{
-	unsigned int i;
+	data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) |
+		    (3 << EC_ACCEL_IDX_YPOS1)    | (3 << EC_ACCEL_IDX_XPOS1) |
+		    (1 << EC_ACCEL_IDX_TEMP1)    | (1 << EC_ACCEL_IDX_RETVAL);
+	if (fast)
+		ret = thinkpad_ec_try_read_row(&ec_accel_args, &data);
+	else
+		ret = thinkpad_ec_read_row(&ec_accel_args, &data);
+	thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */
+	if (ret)
+		return ret;
 
-	for (i = 0; i < 20; i++) {
-		if (!__check_latch(port, val))
-			return 0;
-		udelay(5);
+	/* Check status: */
+	if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) {
+		pr_warn("read RETVAL=0x%02x\n",
+		       data.val[EC_ACCEL_IDX_RETVAL]);
+		return -EIO;
+	}
+
+	if (data.val[EC_ACCEL_IDX_READOUTS] < 1)
+		return -EBUSY; /* no pending readout, try again later */
+
+	/* Parse position data: */
+	pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1);
+	pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1);
+	transform_axes(&pos_x, &pos_y);
+
+	/* Keyboard and mouse activity status is cleared as soon as it's read,
+	 * so applications will eat each other's events. Thus we remember any
+	 * event for KMACT_REMEMBER_PERIOD jiffies.
+	 */
+	if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK)
+		last_keyboard_jiffies = get_jiffies_64();
+	if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK)
+		last_mouse_jiffies = get_jiffies_64();
+
+	temperature = data.val[EC_ACCEL_IDX_TEMP1];
+
+	last_update_jiffies = get_jiffies_64();
+	stale_readout = 0;
+	if (needs_calibration) {
+		rest_x = pos_x;
+		rest_y = pos_y;
+		needs_calibration = 0;
 	}
 
-	return -EIO;
+	return 0;
 }
 
-/*
- * __device_refresh - request a refresh from the accelerometer.  Does not wait
- * for refresh to complete.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_update - acquire locks and query current state
+ *
+ * Query current accelerometer state and update global state variables.
+ * Also prefetches the next query.
+ * Retries until timeout if the accelerometer is not in ready status (common).
+ * Does its own locking.
  */
-static void __device_refresh(void)
+static int hdaps_update(void)
 {
-	udelay(200);
-	if (inb(0x1604) != STATE_FRESH) {
-		outb(0x11, 0x1610);
-		outb(0x01, 0x161f);
+	u64 age = get_jiffies_64() - last_update_jiffies;
+	int total, ret;
+
+	if (!stale_readout && age < (9*HZ)/(10*sampling_rate))
+		return 0; /* already updated recently */
+	for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) {
+		ret = thinkpad_ec_lock();
+		if (ret)
+			return ret;
+		ret = __hdaps_update(0);
+		thinkpad_ec_unlock();
+
+		if (!ret)
+			return 0;
+		if (ret != -EBUSY)
+			break;
+		msleep(RETRY_MSECS);
 	}
+	return ret;
 }
 
-/*
- * __device_refresh_sync - request a synchronous refresh from the
- * accelerometer.  We wait for the refresh to complete.  Returns zero if
- * successful and nonzero on error.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_set_power - enable or disable power to the accelerometer.
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int __device_refresh_sync(void)
+static int hdaps_set_power(int on)
 {
-	__device_refresh();
-	return __wait_latch(0x1604, STATE_FRESH);
+	struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x14, on?0x01:0x00} };
+	struct thinkpad_ec_row data = { .mask = 0x8000 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return ret;
+	if (data.val[0xF] != 0x00)
+		return -EIO;
+	return 0;
 }
 
-/*
- * __device_complete - indicate to the accelerometer that we are done reading
- * data, and then initiate an async refresh.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_set_ec_config - set accelerometer parameters.
+ * @ec_rate: embedded controller sampling rate
+ * @order: embedded controller running average filter order
+ * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.)
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static inline void __device_complete(void)
+static int hdaps_set_ec_config(int ec_rate, int order)
 {
-	inb(0x161f);
-	inb(0x1604);
-	__device_refresh();
+	struct thinkpad_ec_row args = { .mask = 0x000F,
+		.val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} };
+	struct thinkpad_ec_row data = { .mask = 0x8000 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order);
+	if (ret)
+		return ret;
+	if (data.val[0xF] == 0x03) {
+		pr_warn("config param out of range\n");
+		return -EINVAL;
+	}
+	if (data.val[0xF] == 0x06) {
+		pr_warn("config change already pending\n");
+		return -EBUSY;
+	}
+	if (data.val[0xF] != 0x00) {
+		pr_warn("config change error, ret=%d\n",
+		      data.val[0xF]);
+		return -EIO;
+	}
+	return 0;
 }
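+
+/*
+ * For example, with the default settings above (sampling_rate = 50,
+ * oversampling_ratio = 5, running_avg_filter_order = 2), initialization
+ * effectively calls hdaps_set_ec_config(250, 2): the EC samples at 250 Hz
+ * with a running average filter of order 2, and the driver reads the
+ * result at 50 Hz.
+ */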
 
-/*
- * hdaps_readb_one - reads a byte from a single I/O port, placing the value in
- * the given pointer.  Returns zero on success or a negative error on failure.
- * Can sleep.
+/**
+ * hdaps_get_ec_config - get accelerometer parameters.
+ * @ec_rate: embedded controller sampling rate
+ * @order: embedded controller running average filter order
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int hdaps_readb_one(unsigned int port, u8 *val)
+static int hdaps_get_ec_config(int *ec_rate, int *order)
 {
-	int ret;
-
-	mutex_lock(&hdaps_mtx);
-
-	/* do a sync refresh -- we need to be sure that we read fresh data */
-	ret = __device_refresh_sync();
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x17, 0x82} };
+	struct thinkpad_ec_row data = { .mask = 0x801F };
+	int ret = thinkpad_ec_read_row(&args, &data);
 	if (ret)
-		goto out;
-
-	*val = inb(port);
-	__device_complete();
-
-out:
-	mutex_unlock(&hdaps_mtx);
-	return ret;
+		return ret;
+	if (data.val[0xF] != 0x00)
+		return -EIO;
+	if (!(data.val[0x1] & 0x01))
+		return -ENXIO; /* accelerometer polling not enabled */
+	if (data.val[0x1] & 0x02)
+		return -EBUSY; /* config change in progress, retry later */
+	*ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8);
+	*order = data.val[0x4];
+	return 0;
 }
 
-/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */
-static int __hdaps_read_pair(unsigned int port1, unsigned int port2,
-			     int *x, int *y)
+/**
+ * hdaps_get_ec_mode - get EC accelerometer mode
+ * Returns zero on success and negative error code on failure.  Can sleep.
+ */
+static int hdaps_get_ec_mode(u8 *mode)
 {
-	/* do a sync refresh -- we need to be sure that we read fresh data */
-	if (__device_refresh_sync())
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0001, .val = {0x13} };
+	struct thinkpad_ec_row data = { .mask = 0x8002 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return ret;
+	if (data.val[0xF] != 0x00) {
+		pr_warn("accelerometer not implemented (0x%02x)\n",
+		       data.val[0xF]);
 		return -EIO;
-
-	*y = inw(port2);
-	*x = inw(port1);
-	km_activity = inb(HDAPS_PORT_KMACT);
-	__device_complete();
-
-	/* hdaps_invert is a bitvector to negate the axes */
-	if (hdaps_invert & HDAPS_X_AXIS)
-		*x = -*x;
-	if (hdaps_invert & HDAPS_Y_AXIS)
-		*y = -*y;
-
+	}
+	*mode = data.val[0x1];
 	return 0;
 }
 
-/*
- * hdaps_read_pair - reads the values from a pair of ports, placing the values
- * in the given pointers.  Returns zero on success.  Can sleep.
+/**
+ * hdaps_check_ec - checks something about the EC.
+ * Follows the clean-room spec for HDAPS; we don't know what it means.
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int hdaps_read_pair(unsigned int port1, unsigned int port2,
-			   int *val1, int *val2)
+static int hdaps_check_ec(void)
 {
-	int ret;
-
-	mutex_lock(&hdaps_mtx);
-	ret = __hdaps_read_pair(port1, port2, val1, val2);
-	mutex_unlock(&hdaps_mtx);
-
-	return ret;
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x17, 0x81} };
+	struct thinkpad_ec_row data = { .mask = 0x800E };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return ret;
+	if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */
+	      (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */
+	    data.val[0x3] != 0x00 || data.val[0xF] != 0x00) {
+		pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n",
+		       data.val[0x1], data.val[0x2],
+		       data.val[0x3], data.val[0xF]);
+		return -EIO;
+	}
+	return 0;
 }
 
-/*
- * hdaps_device_init - initialize the accelerometer.  Returns zero on success
- * and negative error code on failure.  Can sleep.
+/**
+ * hdaps_device_init - initialize the accelerometer.
+ *
+ * Call several embedded controller functions to test and initialize the
+ * accelerometer.
+ * Returns zero on success and negative error code on failure. Can sleep.
  */
+#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg)
 static int hdaps_device_init(void)
 {
-	int total, ret = -ENXIO;
+	int ret;
+	u8 mode;
 
-	mutex_lock(&hdaps_mtx);
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
 
-	outb(0x13, 0x1610);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
+	if (hdaps_get_ec_mode(&mode))
+		{ FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; }
 
-	/*
-	 * Most ThinkPads return 0x01.
-	 *
-	 * Others--namely the R50p, T41p, and T42p--return 0x03.  These laptops
-	 * have "inverted" axises.
-	 *
-	 * The 0x02 value occurs when the chip has been previously initialized.
-	 */
-	if (__check_latch(0x1611, 0x03) &&
-		     __check_latch(0x1611, 0x02) &&
-		     __check_latch(0x1611, 0x01))
-		goto out;
-
-	printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n",
-	       __get_latch(0x1611));
+	pr_debug("initial mode latch is 0x%02x\n", mode);
+	if (mode == 0x00)
+		{ FAILED_INIT("accelerometer not available"); goto bad; }
 
-	outb(0x17, 0x1610);
-	outb(0x81, 0x1611);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	if (__wait_latch(0x1611, 0x00))
-		goto out;
-	if (__wait_latch(0x1612, 0x60))
-		goto out;
-	if (__wait_latch(0x1613, 0x00))
-		goto out;
-	outb(0x14, 0x1610);
-	outb(0x01, 0x1611);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	outb(0x10, 0x1610);
-	outb(0xc8, 0x1611);
-	outb(0x00, 0x1612);
-	outb(0x02, 0x1613);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	if (__device_refresh_sync())
-		goto out;
-	if (__wait_latch(0x1611, 0x00))
-		goto out;
+	if (hdaps_check_ec())
+		{ FAILED_INIT("hdaps_check_ec failed"); goto bad; }
 
-	/* we have done our dance, now let's wait for the applause */
-	for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) {
-		int x, y;
+	if (hdaps_set_power(1))
+		{ FAILED_INIT("hdaps_set_power failed"); goto bad; }
 
-		/* a read of the device helps push it into action */
-		__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y);
-		if (!__wait_latch(0x1611, 0x02)) {
-			ret = 0;
-			break;
-		}
+	if (hdaps_set_ec_config(sampling_rate*oversampling_ratio,
+				running_avg_filter_order))
+		{ FAILED_INIT("hdaps_set_ec_config failed"); goto bad; }
 
-		msleep(INIT_WAIT_MSECS);
-	}
+	thinkpad_ec_invalidate();
+	udelay(200);
 
-out:
-	mutex_unlock(&hdaps_mtx);
+	/* Just prefetch instead of reading, to avoid ~1sec delay on load */
+	ret = thinkpad_ec_prefetch_row(&ec_accel_args);
+	if (ret)
+		{ FAILED_INIT("initial prefetch failed"); goto bad; }
+	goto good;
+bad:
+	thinkpad_ec_invalidate();
+	ret = -ENXIO;
+good:
+	stale_readout = 1;
+	thinkpad_ec_unlock();
 	return ret;
 }
 
+/**
+ * hdaps_device_shutdown - power off the accelerometer
+ * Returns nonzero on failure. Can sleep.
+ */
+static int hdaps_device_shutdown(void)
+{
+	int ret;
+	ret = hdaps_set_power(0);
+	if (ret) {
+		pr_warn("cannot power off\n");
+		return ret;
+	}
+	ret = hdaps_set_ec_config(0, 1);
+	if (ret)
+		pr_warn("cannot stop EC sampling\n");
+	return ret;
+}
 
 /* Device model stuff */
 
@@ -306,13 +424,29 @@ static int hdaps_probe(struct platform_device *dev)
 }
 
 #ifdef CONFIG_PM_SLEEP
+static int hdaps_suspend(struct device *dev)
+{
+	/* Don't do hdaps polls until resume re-initializes the sensor. */
+	del_timer_sync(&hdaps_timer);
+	hdaps_device_shutdown(); /* ignore errors, effect is negligible */
+	return 0;
+}
+
 static int hdaps_resume(struct device *dev)
 {
-	return hdaps_device_init();
+	int ret = hdaps_device_init();
+	if (ret)
+		return ret;
+
+	mutex_lock(&hdaps_users_mtx);
+	if (hdaps_users)
+		mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
+	mutex_unlock(&hdaps_users_mtx);
+	return 0;
 }
 #endif
 
-static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume);
+static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume);
 
 static struct platform_driver hdaps_driver = {
 	.probe = hdaps_probe,
@@ -322,30 +456,47 @@ static struct platform_driver hdaps_driver = {
 	},
 };
 
-/*
- * hdaps_calibrate - Set our "resting" values.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_calibrate - set our "resting" values.
+ * Does its own locking.
  */
 static void hdaps_calibrate(void)
 {
-	__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y);
+	needs_calibration = 1;
+	hdaps_update();
+	/* If that fails, the mousedev poll will take care of things later. */
 }
 
-static void hdaps_mousedev_poll(struct input_polled_dev *dev)
+/* Timer handler for updating the input device. Runs in softirq context,
+ * so avoid lengthy or blocking operations.
+ */
+static void hdaps_mousedev_poll(unsigned long unused)
 {
-	struct input_dev *input_dev = dev->input;
-	int x, y;
+	int ret;
 
-	mutex_lock(&hdaps_mtx);
+	stale_readout = 1;
 
-	if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y))
-		goto out;
+	/* Cannot sleep.  Try nonblockingly.  If we fail, try again later. */
+	if (thinkpad_ec_try_lock())
+		goto keep_active;
 
-	input_report_abs(input_dev, ABS_X, x - rest_x);
-	input_report_abs(input_dev, ABS_Y, y - rest_y);
-	input_sync(input_dev);
+	ret = __hdaps_update(1); /* fast update, we're in softirq context */
+	thinkpad_ec_unlock();
+	/* Any of "successful", "not yet ready" and "not prefetched"? */
+	if (ret != 0 && ret != -EBUSY && ret != -ENODATA) {
+		pr_err("poll failed, disabling updates\n");
+		return;
+	}
 
-out:
-	mutex_unlock(&hdaps_mtx);
+keep_active:
+	/* Even if we failed now, pos_x,y may have been updated earlier: */
+	input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x);
+	input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y);
+	input_sync(hdaps_idev);
+	input_report_abs(hdaps_idev_raw, ABS_X, pos_x);
+	input_report_abs(hdaps_idev_raw, ABS_Y, pos_y);
+	input_sync(hdaps_idev_raw);
+	mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
 }
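+
+/* With the default sampling_rate of 50, the timer above re-arms roughly
+ * every 1/50 s (20 ms); e.g. with HZ == 1000 that is jiffies + 20.
+ */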
 
 
@@ -354,65 +505,41 @@ out:
 static ssize_t hdaps_position_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
-	int ret, x, y;
-
-	ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "(%d,%d)\n", x, y);
-}
-
-static ssize_t hdaps_variance_show(struct device *dev,
-				   struct device_attribute *attr, char *buf)
-{
-	int ret, x, y;
-
-	ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y);
+	int ret = hdaps_update();
 	if (ret)
 		return ret;
-
-	return sprintf(buf, "(%d,%d)\n", x, y);
+	return sprintf(buf, "(%d,%d)\n", pos_x, pos_y);
 }
 
 static ssize_t hdaps_temp1_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	u8 uninitialized_var(temp);
-	int ret;
-
-	ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%u\n", temp);
-}
-
-static ssize_t hdaps_temp2_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	u8 uninitialized_var(temp);
-	int ret;
-
-	ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp);
+	int ret = hdaps_update();
 	if (ret)
 		return ret;
-
-	return sprintf(buf, "%u\n", temp);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t hdaps_keyboard_activity_show(struct device *dev,
 					    struct device_attribute *attr,
 					    char *buf)
 {
-	return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity));
+	int ret = hdaps_update();
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n",
+	   get_jiffies_64() < last_keyboard_jiffies + KMACT_REMEMBER_PERIOD);
 }
 
 static ssize_t hdaps_mouse_activity_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
-	return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity));
+	int ret = hdaps_update();
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n",
+	   get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD);
 }
 
 static ssize_t hdaps_calibrate_show(struct device *dev,
@@ -425,10 +552,7 @@ static ssize_t hdaps_calibrate_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
 {
-	mutex_lock(&hdaps_mtx);
 	hdaps_calibrate();
-	mutex_unlock(&hdaps_mtx);
-
 	return count;
 }
 
@@ -445,7 +569,7 @@ static ssize_t hdaps_invert_store(struct device *dev,
 	int invert;
 
 	if (sscanf(buf, "%d", &invert) != 1 ||
-	    invert < 0 || invert > HDAPS_BOTH_AXES)
+	    invert < 0 || invert > HDAPS_ORIENT_MAX)
 		return -EINVAL;
 
 	hdaps_invert = invert;
@@ -454,24 +578,128 @@ static ssize_t hdaps_invert_store(struct device *dev,
 	return count;
 }
 
+static ssize_t hdaps_sampling_rate_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", sampling_rate);
+}
+
+static ssize_t hdaps_sampling_rate_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int rate, ret;
+	if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) {
+		pr_warn("must have 0<input_sampling_rate<=HZ=%d\n", HZ);
+		return -EINVAL;
+	}
+	ret = hdaps_set_ec_config(rate*oversampling_ratio,
+				  running_avg_filter_order);
+	if (ret)
+		return ret;
+	sampling_rate = rate;
+	return count;
+}
+
+static ssize_t hdaps_oversampling_ratio_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int ec_rate, order;
+	int ret = hdaps_get_ec_config(&ec_rate, &order);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n", ec_rate / sampling_rate);
+}
+
+static ssize_t hdaps_oversampling_ratio_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int ratio, ret;
+	if (sscanf(buf, "%d", &ratio) != 1 || ratio < 1)
+		return -EINVAL;
+	ret = hdaps_set_ec_config(sampling_rate*ratio,
+				  running_avg_filter_order);
+	if (ret)
+		return ret;
+	oversampling_ratio = ratio;
+	return count;
+}
+
+static ssize_t hdaps_running_avg_filter_order_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int rate, order;
+	int ret = hdaps_get_ec_config(&rate, &order);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n", order);
+}
+
+static ssize_t hdaps_running_avg_filter_order_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int order, ret;
+	if (sscanf(buf, "%d", &order) != 1)
+		return -EINVAL;
+	ret = hdaps_set_ec_config(sampling_rate*oversampling_ratio, order);
+	if (ret)
+		return ret;
+	running_avg_filter_order = order;
+	return count;
+}
+
+static int hdaps_mousedev_open(struct input_dev *dev)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&hdaps_users_mtx);
+	if (hdaps_users++ == 0) /* first input user */
+		mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
+	mutex_unlock(&hdaps_users_mtx);
+	return 0;
+}
+
+static void hdaps_mousedev_close(struct input_dev *dev)
+{
+	mutex_lock(&hdaps_users_mtx);
+	if (--hdaps_users == 0) /* no input users left */
+		del_timer_sync(&hdaps_timer);
+	mutex_unlock(&hdaps_users_mtx);
+
+	module_put(THIS_MODULE);
+}
+
 static DEVICE_ATTR(position, 0444, hdaps_position_show, NULL);
-static DEVICE_ATTR(variance, 0444, hdaps_variance_show, NULL);
 static DEVICE_ATTR(temp1, 0444, hdaps_temp1_show, NULL);
-static DEVICE_ATTR(temp2, 0444, hdaps_temp2_show, NULL);
-static DEVICE_ATTR(keyboard_activity, 0444, hdaps_keyboard_activity_show, NULL);
+  /* "temp1" instead of "temperature" is hwmon convention */
+static DEVICE_ATTR(keyboard_activity, 0444,
+		   hdaps_keyboard_activity_show, NULL);
 static DEVICE_ATTR(mouse_activity, 0444, hdaps_mouse_activity_show, NULL);
-static DEVICE_ATTR(calibrate, 0644, hdaps_calibrate_show,hdaps_calibrate_store);
+static DEVICE_ATTR(calibrate, 0644,
+		   hdaps_calibrate_show, hdaps_calibrate_store);
 static DEVICE_ATTR(invert, 0644, hdaps_invert_show, hdaps_invert_store);
+static DEVICE_ATTR(sampling_rate, 0644,
+		   hdaps_sampling_rate_show, hdaps_sampling_rate_store);
+static DEVICE_ATTR(oversampling_ratio, 0644,
+		   hdaps_oversampling_ratio_show,
+		   hdaps_oversampling_ratio_store);
+static DEVICE_ATTR(running_avg_filter_order, 0644,
+		   hdaps_running_avg_filter_order_show,
+		   hdaps_running_avg_filter_order_store);
 
 static struct attribute *hdaps_attributes[] = {
 	&dev_attr_position.attr,
-	&dev_attr_variance.attr,
 	&dev_attr_temp1.attr,
-	&dev_attr_temp2.attr,
 	&dev_attr_keyboard_activity.attr,
 	&dev_attr_mouse_activity.attr,
 	&dev_attr_calibrate.attr,
 	&dev_attr_invert.attr,
+	&dev_attr_sampling_rate.attr,
+	&dev_attr_oversampling_ratio.attr,
+	&dev_attr_running_avg_filter_order.attr,
 	NULL,
 };
 
@@ -482,84 +710,77 @@ static struct attribute_group hdaps_attribute_group = {
 
 /* Module stuff */
 
-/* hdaps_dmi_match - found a match.  return one, short-circuiting the hunt. */
-static int __init hdaps_dmi_match(const struct dmi_system_id *id)
-{
-	pr_info("%s detected\n", id->ident);
-	return 1;
-}
-
 /* hdaps_dmi_match_invert - found an inverted match. */
 static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id)
 {
-	hdaps_invert = (unsigned long)id->driver_data;
-	pr_info("inverting axis (%u) readings\n", hdaps_invert);
-	return hdaps_dmi_match(id);
+	unsigned int orient = (kernel_ulong_t) id->driver_data;
+	hdaps_invert = orient;
+	pr_info("%s detected, setting orientation %u\n", id->ident, orient);
+	return 1; /* stop enumeration */
 }
 
-#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) {	\
+#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \
 	.ident = vendor " " model,			\
 	.callback = hdaps_dmi_match_invert,		\
-	.driver_data = (void *)axes,			\
+	.driver_data = (void *)(orient),		\
 	.matches = {					\
 		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
 		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
 	}						\
 }
 
-#define HDAPS_DMI_MATCH_NORMAL(vendor, model)		\
-	HDAPS_DMI_MATCH_INVERT(vendor, model, 0)
-
-/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match
-   "ThinkPad T42p", so the order of the entries matters.
-   If your ThinkPad is not recognized, please update to latest
-   BIOS. This is especially the case for some R52 ThinkPads. */
-static struct dmi_system_id __initdata hdaps_whitelist[] = {
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES),
+/* List of models with abnormal axis configuration.
+   Note that an entry matching "ThinkPad T42" would also match
+   "ThinkPad T42p", and enumeration stops after the first match,
+   so the order of the entries matters. */
+static struct dmi_system_id __initdata hdaps_whitelist[] = {
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W51O", HDAPS_ORIENT_MAX),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP),
 	{ .ident = NULL }
 };
 
 static int __init hdaps_init(void)
 {
-	struct input_dev *idev;
 	int ret;
 
-	if (!dmi_check_system(hdaps_whitelist)) {
-		pr_warn("supported laptop not found!\n");
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) {
-		ret = -ENXIO;
-		goto out;
-	}
+	/* Determine axis orientation */
+	if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */
+		if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? */
+			hdaps_invert = 0; /* default */
 
+	/* Init timer before platform_driver_register, in case of suspend */
+	init_timer(&hdaps_timer);
+	hdaps_timer.function = hdaps_mousedev_poll;
 	ret = platform_driver_register(&hdaps_driver);
 	if (ret)
-		goto out_region;
+		goto out;
 
 	pdev = platform_device_register_simple("hdaps", -1, NULL, 0);
 	if (IS_ERR(pdev)) {
@@ -571,47 +792,79 @@ static int __init hdaps_init(void)
 	if (ret)
 		goto out_device;
 
-	hdaps_idev = input_allocate_polled_device();
+	hdaps_idev = input_allocate_device();
 	if (!hdaps_idev) {
 		ret = -ENOMEM;
 		goto out_group;
 	}
 
-	hdaps_idev->poll = hdaps_mousedev_poll;
-	hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL;
-
-	/* initial calibrate for the input device */
-	hdaps_calibrate();
+	hdaps_idev_raw = input_allocate_device();
+	if (!hdaps_idev_raw) {
+		ret = -ENOMEM;
+		goto out_idev_first;
+	}
 
-	/* initialize the input class */
-	idev = hdaps_idev->input;
-	idev->name = "hdaps";
-	idev->phys = "isa1600/input0";
-	idev->id.bustype = BUS_ISA;
-	idev->dev.parent = &pdev->dev;
-	idev->evbit[0] = BIT_MASK(EV_ABS);
-	input_set_abs_params(idev, ABS_X,
+	/* calibration for the input device (deferred to avoid delay) */
+	needs_calibration = 1;
+
+	/* initialize the joystick-like fuzzed input device */
+	hdaps_idev->name = "ThinkPad HDAPS joystick emulation";
+	hdaps_idev->phys = "hdaps/input0";
+	hdaps_idev->id.bustype = BUS_HOST;
+	hdaps_idev->id.vendor  = HDAPS_INPUT_VENDOR;
+	hdaps_idev->id.product = HDAPS_INPUT_PRODUCT;
+	hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	hdaps_idev->cdev.dev = &pdev->dev;
+#endif
+	hdaps_idev->evbit[0] = BIT(EV_ABS);
+	hdaps_idev->open = hdaps_mousedev_open;
+	hdaps_idev->close = hdaps_mousedev_close;
+	input_set_abs_params(hdaps_idev, ABS_X,
 			-256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT);
-	input_set_abs_params(idev, ABS_Y,
+	input_set_abs_params(hdaps_idev, ABS_Y,
 			-256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT);
 
-	ret = input_register_polled_device(hdaps_idev);
+	ret = input_register_device(hdaps_idev);
 	if (ret)
 		goto out_idev;
 
-	pr_info("driver successfully loaded\n");
+	/* initialize the raw data input device */
+	hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data";
+	hdaps_idev_raw->phys = "hdaps/input1";
+	hdaps_idev_raw->id.bustype = BUS_HOST;
+	hdaps_idev_raw->id.vendor  = HDAPS_INPUT_VENDOR;
+	hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT;
+	hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	hdaps_idev_raw->cdev.dev = &pdev->dev;
+#endif
+	hdaps_idev_raw->evbit[0] = BIT(EV_ABS);
+	hdaps_idev_raw->open = hdaps_mousedev_open;
+	hdaps_idev_raw->close = hdaps_mousedev_close;
+	input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0);
+	input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0);
+
+	ret = input_register_device(hdaps_idev_raw);
+	if (ret)
+		goto out_idev_reg_first;
+
+	pr_info("driver successfully loaded.\n");
 	return 0;
 
+out_idev_reg_first:
+	input_unregister_device(hdaps_idev);
 out_idev:
-	input_free_polled_device(hdaps_idev);
+	input_free_device(hdaps_idev_raw);
+out_idev_first:
+	input_free_device(hdaps_idev);
 out_group:
 	sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group);
 out_device:
 	platform_device_unregister(pdev);
 out_driver:
 	platform_driver_unregister(&hdaps_driver);
-out_region:
-	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
+	hdaps_device_shutdown();
 out:
 	pr_warn("driver init failed (ret=%d)!\n", ret);
 	return ret;
@@ -619,12 +872,12 @@ out:
 
 static void __exit hdaps_exit(void)
 {
-	input_unregister_polled_device(hdaps_idev);
-	input_free_polled_device(hdaps_idev);
+	input_unregister_device(hdaps_idev_raw);
+	input_unregister_device(hdaps_idev);
+	hdaps_device_shutdown(); /* ignore errors, effect is negligible */
 	sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group);
 	platform_device_unregister(pdev);
 	platform_driver_unregister(&hdaps_driver);
-	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
 
 	pr_info("driver unloaded\n");
 }
@@ -632,9 +885,8 @@ static void __exit hdaps_exit(void)
 module_init(hdaps_init);
 module_exit(hdaps_exit);
 
-module_param_named(invert, hdaps_invert, int, 0);
-MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, "
-		 "2 invert y-axis, 3 invert both axes.");
+module_param_named(invert, hdaps_invert, uint, 0);
+MODULE_PARM_DESC(invert, "axis orientation code");
 
 MODULE_AUTHOR("Robert Love");
 MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver");
diff --git b/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c
new file mode 100644
index 0000000..597614b
--- /dev/null
+++ b/drivers/platform/x86/thinkpad_ec.c
@@ -0,0 +1,513 @@
+/*
+ *  thinkpad_ec.c - ThinkPad embedded controller LPC3 functions
+ *
+ *  The embedded controller on ThinkPad laptops has a non-standard interface,
+ *  where LPC channel 3 of the H8S EC chip is hooked up to IO ports
+ *  0x1600-0x161F and implements (a special case of) the H8S LPC protocol.
+ *  The EC LPC interface provides various system management services (currently
+ *  known: battery information and accelerometer readouts). This driver
+ *  provides access and mutual exclusion for the EC interface.
+ *
+ *  The LPC protocol and terminology are documented here:
+ *  "H8S/2104B Group Hardware Manual",
+ *  http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
+ *
+ *  Copyright (C) 2006-2007 Shem Multinymous <multinymous@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dmi.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/thinkpad_ec.h>
+#include <linux/jiffies.h>
+#include <asm/io.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+	#include <asm/semaphore.h>
+#else
+	#include <linux/semaphore.h>
+#endif
+
+#define TP_VERSION "0.42"
+
+MODULE_AUTHOR("Shem Multinymous");
+MODULE_DESCRIPTION("ThinkPad embedded controller hardware access");
+MODULE_VERSION(TP_VERSION);
+MODULE_LICENSE("GPL");
+
+/* IO ports used by embedded controller LPC channel 3: */
+#define TPC_BASE_PORT 0x1600
+#define TPC_NUM_PORTS 0x20
+#define TPC_STR3_PORT 0x1604  /* Reads H8S EC register STR3 */
+#define TPC_TWR0_PORT  0x1610 /* Mapped to H8S EC register TWR0MW/SW  */
+#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. */
+  /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 0<i<16) */
+
+/* H8S STR3 status flags (see "H8S/2104B Group Hardware Manual" p.549) */
+#define H8S_STR3_IBF3B 0x80  /* Bidi. Data Register Input Buffer Full */
+#define H8S_STR3_OBF3B 0x40  /* Bidi. Data Register Output Buffer Full */
+#define H8S_STR3_MWMF  0x20  /* Master Write Mode Flag */
+#define H8S_STR3_SWMF  0x10  /* Slave Write Mode Flag */
+#define H8S_STR3_MASK  0xF0  /* All bits we care about in STR3 */
+
+/* Timeouts and retries */
+#define TPC_READ_RETRIES     150
+#define TPC_READ_NDELAY      500
+#define TPC_REQUEST_RETRIES 1000
+#define TPC_REQUEST_NDELAY    10
+#define TPC_PREFETCH_TIMEOUT   (HZ/10)  /* invalidate prefetch after 0.1sec */
+
+/* A few macros for printk()ing: */
+#define MSG_FMT(fmt, args...) \
+  "thinkpad_ec: %s: " fmt "\n", __func__, ## args
+#define REQ_FMT(msg, code) \
+  MSG_FMT("%s: (0x%02x:0x%02x)->0x%02x", \
+	  msg, args->val[0x0], args->val[0xF], code)
+
+/* State of request prefetching: */
+static u8 prefetch_arg0, prefetch_argF;           /* Args of last prefetch */
+static u64 prefetch_jiffies;                      /* time of prefetch, or: */
+#define TPC_PREFETCH_NONE   INITIAL_JIFFIES       /*   No prefetch */
+#define TPC_PREFETCH_JUNK   (INITIAL_JIFFIES+1)   /*   Ignore prefetch */
+
+/* Locking: */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static DECLARE_MUTEX(thinkpad_ec_mutex);
+#else
+static DEFINE_SEMAPHORE(thinkpad_ec_mutex);
+#endif
+
+/* Kludge in case the ACPI DSDT reserves the ports we need. */
+static bool force_io;    /* Willing to do IO to ports we couldn't reserve? */
+static int reserved_io; /* Successfully reserved the ports? */
+module_param_named(force_io, force_io, bool, 0600);
+MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)");
+
+/**
+ * thinkpad_ec_lock - get lock on the ThinkPad EC
+ *
+ * Get exclusive lock for accessing the ThinkPad embedded controller LPC3
+ * interface. Returns 0 iff lock acquired.
+ */
+int thinkpad_ec_lock(void)
+{
+	int ret;
+	ret = down_interruptible(&thinkpad_ec_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_lock);
+
+/**
+ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC
+ *
+ * Try getting an exclusive lock for accessing the ThinkPad embedded
+ * controller LPC3 interface. Returns immediately if the lock is not
+ * available; neither blocks nor sleeps. Returns 0 iff lock acquired.
+ */
+int thinkpad_ec_try_lock(void)
+{
+	return down_trylock(&thinkpad_ec_mutex);
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock);
+
+/**
+ * thinkpad_ec_unlock - release lock on ThinkPad EC
+ *
+ * Release a previously acquired exclusive lock on the ThinkPad embedded
+ * controller LPC3 interface.
+ */
+void thinkpad_ec_unlock(void)
+{
+	up(&thinkpad_ec_mutex);
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_unlock);
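+
+/*
+ * Typical calling pattern (a sketch of how a client such as hdaps.c uses
+ * this API; the row contents below are only an example):
+ *
+ *	struct thinkpad_ec_row args = { .mask = 0x0001, .val = {0x11} };
+ *	struct thinkpad_ec_row data = { .mask = 0x8000 };
+ *	int ret = thinkpad_ec_lock();
+ *	if (ret)
+ *		return ret;
+ *	ret = thinkpad_ec_read_row(&args, &data);
+ *	thinkpad_ec_unlock();
+ */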
+
+/**
+ * thinkpad_ec_request_row - tell embedded controller to prepare a row
+ * @args Input register arguments
+ *
+ * Requests a data row by writing to H8S LPC registers TWR0 through TWR15 (or
+ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group
+ * Hardware Manual". Does sanity checks via status register STR3.
+ */
+static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args)
+{
+	u8 str3;
+	int i;
+
+	/* EC protocol requires write to TWR0 (function code): */
+	if (!(args->mask & 0x0001)) {
+		printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask));
+		return -EINVAL;
+	}
+
+	/* Check initial STR3 status: */
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 & H8S_STR3_OBF3B) { /* data already pending */
+		inb(TPC_TWR15_PORT); /* marks end of previous transaction */
+		if (prefetch_jiffies == TPC_PREFETCH_NONE)
+			printk(KERN_WARNING REQ_FMT(
+			       "EC has result from unrequested transaction",
+			       str3));
+		return -EBUSY; /* EC will be ready in a few usecs */
+	} else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */
+		if (prefetch_jiffies == TPC_PREFETCH_NONE)
+			printk(KERN_WARNING REQ_FMT(
+			       "EC is busy with unrequested transaction",
+			       str3));
+		return -EBUSY; /* data will be pending in a few usecs */
+	} else if (str3 != 0x00) { /* unexpected status? */
+		printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3));
+		return -EIO;
+	}
+
+	/* Send TWR0MW: */
+	outb(args->val[0], TPC_TWR0_PORT);
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 != H8S_STR3_MWMF) { /* not accepted? */
+		printk(KERN_WARNING REQ_FMT("arg0 rejected", str3));
+		return -EIO;
+	}
+
+	/* Send TWR1 through TWR14: */
+	for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++)
+		if ((args->mask>>i)&1)
+			outb(args->val[i], TPC_TWR0_PORT+i);
+
+	/* Send TWR15 (default to 0x01). This marks end of command. */
+	outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT);
+
+	/* Wait until EC starts writing its reply (~60ns on average).
+	 * Releasing locks before this happens may cause an EC hang
+	 * due to firmware bug!
+	 */
+	for (i = 0; i < TPC_REQUEST_RETRIES; i++) {
+		str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+		if (str3 & H8S_STR3_SWMF) /* EC started replying */
+			return 0;
+		else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF)))
+			/* Normal progress (the EC hasn't seen the request
+			 * yet, or is processing it). Wait it out. */
+			ndelay(TPC_REQUEST_NDELAY);
+		else { /* weird EC status */
+			printk(KERN_WARNING
+			       REQ_FMT("bad end STR3", str3));
+			return -EIO;
+		}
+	}
+	printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3));
+	return -EIO;
+}
+
+/**
+ * thinkpad_ec_read_data - read pre-requested row-data from EC
+ * @args Input register arguments of pre-requested rows
+ * @data Output register values
+ *
+ * Reads current row data from the controller, assuming it's already
+ * requested. Follows the H8S spec for register access and status checks.
+ */
+static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args,
+				 struct thinkpad_ec_row *data)
+{
+	int i;
+	u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	/* Once we make a request, STR3 assumes the sequence of values listed
+	 * in the following 'if' as it reads the request and writes its data.
+	 * This takes a few dozen nanoseconds in total, with very high variance.
+	 */
+	if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) ||
+	    str3 == 0x00 ||  /* the 0x00 is indistinguishable from idle EC! */
+	    str3 == H8S_STR3_SWMF)
+		return -EBUSY; /* not ready yet */
+	/* Finally, the EC signals output buffer full: */
+	if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) {
+		printk(KERN_WARNING
+		       REQ_FMT("bad initial STR3", str3));
+		return -EIO;
+	}
+
+	/* Read first byte (signals start of read transactions): */
+	data->val[0] = inb(TPC_TWR0_PORT);
+	/* Optionally read 14 more bytes: */
+	for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++)
+		if ((data->mask >> i)&1)
+			data->val[i] = inb(TPC_TWR0_PORT+i);
+	/* Read last byte from 0x161F (signals end of read transaction): */
+	data->val[0xF] = inb(TPC_TWR15_PORT);
+
+	/* Readout still pending? */
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 & H8S_STR3_OBF3B)
+		printk(KERN_WARNING
+		       REQ_FMT("OBF3B=1 after read", str3));
+	/* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */
+	if (data->val[0xF] == 0x80)
+		printk(KERN_WARNING
+		       REQ_FMT("0x161F reports error", data->val[0xF]));
+	return 0;
+}
+
+/**
+ * thinkpad_ec_is_row_fetched - is the given row currently prefetched?
+ *
+ * To keep things simple we compare only the first and last args;
+ * this suffices for all known cases.
+ */
+static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args)
+{
+	return (prefetch_jiffies != TPC_PREFETCH_NONE) &&
+	       (prefetch_jiffies != TPC_PREFETCH_JUNK) &&
+	       (prefetch_arg0 == args->val[0]) &&
+	       (prefetch_argF == args->val[0xF]) &&
+	       (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT);
+}
+
+/**
+ * thinkpad_ec_read_row - request and read data from ThinkPad EC
+ * @args Input register arguments
+ * @data Output register values
+ *
+ * Read a data row from the ThinkPad embedded controller LPC3 interface.
+ * Does fetching and retrying if needed. The row is specified by an
+ * array of 16 bytes, some of which may be undefined (but the first is
+ * mandatory). These bytes are given in @args->val[], where @args->val[i] is
+ * used iff (@args->mask>>i)&1). The resulting row data is stored in
+ * @data->val[], but is only guaranteed to be valid for indices corresponding
+ * to set bit in @data->mask. That is, if @data->mask&(1<<i)==0 then
+ * @data->val[i] is undefined.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_read_row(const struct thinkpad_ec_row *args,
+			 struct thinkpad_ec_row *data)
+{
+	int retries, ret;
+
+	if (thinkpad_ec_is_row_fetched(args))
+		goto read_row; /* already requested */
+
+	/* Request the row */
+	for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+		ret = thinkpad_ec_request_row(args);
+		if (!ret)
+			goto read_row;
+		if (ret != -EBUSY)
+			break;
+		ndelay(TPC_READ_NDELAY);
+	}
+	printk(KERN_ERR REQ_FMT("failed requesting row", ret));
+	goto out;
+
+read_row:
+	/* Read the row's data */
+	for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+		ret = thinkpad_ec_read_data(args, data);
+		if (!ret)
+			goto out;
+		if (ret != -EBUSY)
+			break;
+		ndelay(TPC_READ_NDELAY);
+	}
+
+	printk(KERN_ERR REQ_FMT("failed waiting for data", ret));
+
+out:
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_read_row);
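+
+/*
+ * For instance, an args->mask of 0x8001 with args->val[0x0] == 0x01 and
+ * args->val[0xF] == 0x00 writes just TWR0 and TWR15 (the arguments used by
+ * thinkpad_ec_test() below), and a data->mask of 0x8000 requests only
+ * val[0xF], the status byte, on readout.
+ */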
+
+/**
+ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC
+ * @args Input register arguments
+ * @data Output register values
+ *
+ * Try reading a data row from the ThinkPad embedded controller LPC3
+ * interface, if this row was recently prefetched using
+ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block.
+ * The parameters have the same meaning as in thinkpad_ec_read_row().
+ *
+ * Returns -EBUSY if data is not ready and -ENODATA if row not prefetched.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args,
+			     struct thinkpad_ec_row *data)
+{
+	int ret;
+	if (!thinkpad_ec_is_row_fetched(args)) {
+		ret = -ENODATA;
+	} else {
+		ret = thinkpad_ec_read_data(args, data);
+		if (!ret)
+			prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row);
+
+/**
+ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC
+ * @args Input register arguments
+ *
+ * Prefetch a data row from the ThinkPad embedded controller LPC3
+ * interface. A subsequent call to thinkpad_ec_read_row() with the
+ * same arguments will be faster, and a subsequent call to
+ * thinkpad_ec_try_read_row() stands a good chance of succeeding if
+ * done neither too soon nor too late. See
+ * thinkpad_ec_read_row() for the meaning of @args.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args)
+{
+	int ret;
+	ret = thinkpad_ec_request_row(args);
+	if (ret) {
+		prefetch_jiffies = TPC_PREFETCH_JUNK;
+	} else {
+		prefetch_jiffies = get_jiffies_64();
+		prefetch_arg0 = args->val[0x0];
+		prefetch_argF = args->val[0xF];
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row);
+
+/**
+ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data
+ *
+ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the
+ * ThinkPad embedded controller LPC3 interface.
+ * Must be called before unlocking by any code that accesses the controller
+ * ports directly.
+ */
+void thinkpad_ec_invalidate(void)
+{
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate);
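+
+/*
+ * A non-blocking reader (such as the hdaps timer callback) can combine
+ * these primitives roughly as follows (sketch only, error handling
+ * abbreviated):
+ *
+ *	if (thinkpad_ec_try_lock())
+ *		return;      (controller busy, try again on the next tick)
+ *	ret = thinkpad_ec_try_read_row(&args, &data);
+ *	thinkpad_ec_prefetch_row(&args);      (queue the next readout)
+ *	thinkpad_ec_unlock();
+ *	if (ret == -EBUSY || ret == -ENODATA)
+ *		return;      (data not ready or not prefetched)
+ */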
+
+
+/*** Checking for EC hardware ***/
+
+/**
+ * thinkpad_ec_test - verify the EC is present and follows protocol
+ *
+ * Ensure the EC LPC3 channel really works on this machine by making
+ * an EC request and seeing if the EC follows the documented H8S protocol.
+ * The requested row just reads battery status, so it should be harmless to
+ * access it (on a correct EC).
+ * This test writes to IO ports, so execute only after checking DMI.
+ */
+static int __init thinkpad_ec_test(void)
+{
+	int ret;
+	const struct thinkpad_ec_row args = /* battery 0 basic status */
+	  { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} };
+	struct thinkpad_ec_row data = { .mask = 0x0000 };
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
+	ret = thinkpad_ec_read_row(&args, &data);
+	thinkpad_ec_unlock();
+	return ret;
+}
+
+/* Search all DMI device names of a given type for a substring */
+static int __init dmi_find_substring(int type, const char *substr)
+{
+	const struct dmi_device *dev = NULL;
+	while ((dev = dmi_find_device(type, NULL, dev))) {
+		if (strstr(dev->name, substr))
+			return 1;
+	}
+	return 0;
+}
+
+#define TP_DMI_MATCH(vendor,model)	{		\
+	.ident = vendor " " model,			\
+	.matches = {					\
+		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
+		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
+	}						\
+}
+
+/* Check DMI for existence of ThinkPad embedded controller */
+static int __init check_dmi_for_ec(void)
+{
+	/* A few old models that have a good EC but don't report it in DMI */
+	struct dmi_system_id tp_whitelist[] = {
+		TP_DMI_MATCH("IBM", "ThinkPad A30"),
+		TP_DMI_MATCH("IBM", "ThinkPad T23"),
+		TP_DMI_MATCH("IBM", "ThinkPad X24"),
+		TP_DMI_MATCH("LENOVO", "ThinkPad"),
+		{ .ident = NULL }
+	};
+	return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING,
+				  "IBM ThinkPad Embedded Controller") ||
+	       dmi_check_system(tp_whitelist);
+}
+
+/*** Init and cleanup ***/
+
+static int __init thinkpad_ec_init(void)
+{
+	if (!check_dmi_for_ec()) {
+		printk(KERN_WARNING
+		       "thinkpad_ec: no ThinkPad embedded controller!\n");
+		return -ENODEV;
+	}
+
+	if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) {
+		reserved_io = 1;
+	} else {
+		printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ",
+		       TPC_BASE_PORT,
+		       TPC_BASE_PORT + TPC_NUM_PORTS - 1);
+		if (force_io) {
+			printk("forcing use of unreserved IO ports.\n");
+		} else {
+			printk("consider using force_io=1.\n");
+			return -ENXIO;
+		}
+	}
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+	if (thinkpad_ec_test()) {
+		printk(KERN_ERR "thinkpad_ec: initial ec test failed\n");
+		if (reserved_io)
+			release_region(TPC_BASE_PORT, TPC_NUM_PORTS);
+		return -ENXIO;
+	}
+	printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n");
+	return 0;
+}
+
+static void __exit thinkpad_ec_exit(void)
+{
+	if (reserved_io)
+		release_region(TPC_BASE_PORT, TPC_NUM_PORTS);
+	printk(KERN_INFO "thinkpad_ec: unloaded.\n");
+}
+
+module_init(thinkpad_ec_init);
+module_exit(thinkpad_ec_exit);
diff --git b/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c
new file mode 100644
index 0000000..209cb64
--- /dev/null
+++ b/drivers/platform/x86/tp_smapi.c
@@ -0,0 +1,1493 @@
+/*
+ *  tp_smapi.c - ThinkPad SMAPI support
+ *
+ *  This driver exposes some features of the System Management Application
+ *  Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on
+ *  models in which the SMAPI BIOS runs in SMM and is invoked by writing
+ *  to the APM control port 0xB2.
+ *  It also exposes battery status information, obtained from the ThinkPad
+ *  embedded controller (via the thinkpad_ec module).
+ *  Ancient ThinkPad models use a different interface, supported by the
+ *  "thinkpad" module from "tpctl".
+ *
+ *  Many of the battery status values obtained from the EC simply mirror
+ *  values provided by the battery's Smart Battery System (SBS) interface, so
+ *  their meaning is defined by the Smart Battery Data Specification (see
+ *  http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec
+ *  are given in the code where relevant.
+ *
+ *  Copyright (C) 2006 Shem Multinymous <multinymous@gmail.com>.
+ *  SMAPI access code based on the mwave driver by Mike Sullivan.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/mc146818rtc.h>	/* CMOS defines */
+#include <linux/delay.h>
+#include <linux/version.h>
+#include <linux/thinkpad_ec.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#define TP_VERSION "0.42"
+#define TP_DESC "ThinkPad SMAPI Support"
+#define TP_DIR "smapi"
+
+MODULE_AUTHOR("Shem Multinymous");
+MODULE_DESCRIPTION(TP_DESC);
+MODULE_VERSION(TP_VERSION);
+MODULE_LICENSE("GPL");
+
+static struct platform_device *pdev;
+
+static int tp_debug;
+module_param_named(debug, tp_debug, int, 0600);
+MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)");
+
+/* A few macros for printk()ing: */
+#define TPRINTK(level, fmt, args...) \
+  dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args)
+#define DPRINTK(fmt, args...) \
+  do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0)
+
+/*********************************************************************
+ * SMAPI interface
+ */
+
+/* SMAPI functions (register BX when making the SMM call). */
+#define SMAPI_GET_INHIBIT_CHARGE                0x2114
+#define SMAPI_SET_INHIBIT_CHARGE                0x2115
+#define SMAPI_GET_THRESH_START                  0x2116
+#define SMAPI_SET_THRESH_START                  0x2117
+#define SMAPI_GET_FORCE_DISCHARGE               0x2118
+#define SMAPI_SET_FORCE_DISCHARGE               0x2119
+#define SMAPI_GET_THRESH_STOP                   0x211a
+#define SMAPI_SET_THRESH_STOP                   0x211b
+
+/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at
+ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) */
+#define SMAPI_RETCODE_EOF 0xff
+static struct { u8 rc; char *msg; int ret; } smapi_retcode[] =
+{
+	{0x00, "OK", 0},
+	{0x53, "SMAPI function is not available", -ENXIO},
+	{0x81, "Invalid parameter", -EINVAL},
+	{0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP},
+	{0x90, "System error", -EIO},
+	{0x91, "System is invalid", -EIO},
+	{0x92, "System is busy", -EBUSY},
+	{0xa0, "Device error (disk read error)", -EIO},
+	{0xa1, "Device is busy", -EBUSY},
+	{0xa2, "Device is not attached", -ENXIO},
+	{0xa3, "Device is disabled", -EIO},
+	{0xa4, "Request parameter is out of range", -EINVAL},
+	{0xa5, "Request parameter is not accepted", -EINVAL},
+	{0xa6, "Transient error", -EBUSY}, /* ? */
+	{SMAPI_RETCODE_EOF, "Unknown error code", -EIO}
+};
+
+
+#define SMAPI_MAX_RETRIES 10
+#define SMAPI_PORT2 0x4F           /* fixed port, meaning unclear */
+static unsigned short smapi_port;  /* APM control port, normally 0xB2 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static DECLARE_MUTEX(smapi_mutex);
+#else
+static DEFINE_SEMAPHORE(smapi_mutex);
+#endif
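+
+/* smapi_mutex is a binary semaphore (rather than a struct mutex) so that
+ * the same down()/up() calls work with both the legacy DECLARE_MUTEX and
+ * the newer DEFINE_SEMAPHORE definitions selected above. */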
+
+/**
+ * find_smapi_port - read SMAPI port from NVRAM
+ */
+static int __init find_smapi_port(void)
+{
+	u16 smapi_id = 0;
+	unsigned short port = 0;
+	unsigned long flags;
+
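+	/* NVRAM layout as read below (derived from this code): CMOS bytes
+	 * 0x7C-0x7D hold a little-endian signature that is expected to read
+	 * 0x5349 when SMAPI is supported, and bytes 0x7E-0x7F hold the
+	 * little-endian APM control port number. */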
+	spin_lock_irqsave(&rtc_lock, flags);
+	smapi_id = CMOS_READ(0x7C);
+	smapi_id |= (CMOS_READ(0x7D) << 8);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	if (smapi_id != 0x5349) {
+		printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id);
+		return -ENXIO;
+	}
+	spin_lock_irqsave(&rtc_lock, flags);
+	port = CMOS_READ(0x7E);
+	port |= (CMOS_READ(0x7F) << 8);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	if (port == 0) {
+		printk(KERN_ERR "unable to read SMAPI port number\n");
+		return -ENXIO;
+	}
+	return port;
+}
+
+/**
+ * smapi_request - make a SMAPI call
+ * @inEBX, @inECX, @inEDI, @inESI: input registers
+ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: output registers
+ * @msg: textual error message
+ * Invokes the SMAPI SMBIOS with the given input and output args.
+ * All outputs are optional (can be %NULL).
+ * Returns 0 when successful, and a negative errno constant
+ * (see smapi_retcode above) upon failure.
+ */
+static int smapi_request(u32 inEBX, u32 inECX,
+			 u32 inEDI, u32 inESI,
+			 u32 *outEBX, u32 *outECX, u32 *outEDX,
+			 u32 *outEDI, u32 *outESI, const char **msg)
+{
+	int ret = 0;
+	int i;
+	int retries;
+	u8 rc;
+	/* Must use local vars for output regs, due to reg pressure. */
+	u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI;
+
+	for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) {
+		DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x",
+			inEBX, inECX, inEDI, inESI);
+
+		/* SMAPI's SMBIOS call and thinkpad_ec end up using
+		 * different interfaces to the same chip, so play it safe. */
+		ret = thinkpad_ec_lock();
+		if (ret)
+			return ret;
+
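+		/* Calling convention used below (as implemented here, not
+		 * from a spec): EAX=0x00005380 selects the SMAPI entry,
+		 * BX/CX/DI/SI carry the arguments, and the OUT to the APM
+		 * control port (and to the fixed port 0x4F) raises the SMI;
+		 * on return, AH holds the SMAPI return code decoded below. */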
+		__asm__ __volatile__(
+			"movl  $0x00005380,%%eax\n\t"
+			"movl  %6,%%ebx\n\t"
+			"movl  %7,%%ecx\n\t"
+			"movl  %8,%%edi\n\t"
+			"movl  %9,%%esi\n\t"
+			"xorl  %%edx,%%edx\n\t"
+			"movw  %10,%%dx\n\t"
+			"out   %%al,%%dx\n\t"  /* trigger SMI to SMBIOS */
+			"out   %%al,$0x4F\n\t"
+			"movl  %%eax,%0\n\t"
+			"movl  %%ebx,%1\n\t"
+			"movl  %%ecx,%2\n\t"
+			"movl  %%edx,%3\n\t"
+			"movl  %%edi,%4\n\t"
+			"movl  %%esi,%5\n\t"
+			:"=m"(tmpEAX),
+			 "=m"(tmpEBX),
+			 "=m"(tmpECX),
+			 "=m"(tmpEDX),
+			 "=m"(tmpEDI),
+			 "=m"(tmpESI)
+			:"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI),
+			 "m"((u16)smapi_port)
+			:"%eax", "%ebx", "%ecx", "%edx", "%edi",
+			 "%esi");
+
+		thinkpad_ec_invalidate();
+		thinkpad_ec_unlock();
+
+		/* Don't let the next SMAPI access happen too quickly;
+		 * it may cause problems. (We're holding smapi_mutex.) */
+		msleep(50);
+
+		if (outEBX) *outEBX = tmpEBX;
+		if (outECX) *outECX = tmpECX;
+		if (outEDX) *outEDX = tmpEDX;
+		if (outESI) *outESI = tmpESI;
+		if (outEDI) *outEDI = tmpEDI;
+
+		/* Look up error code */
+		rc = (tmpEAX>>8)&0xFF;
+		for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF &&
+			    smapi_retcode[i].rc != rc; ++i) {}
+		ret = smapi_retcode[i].ret;
+		if (msg)
+			*msg = smapi_retcode[i].msg;
+
+		DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d",
+			 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret);
+		if (ret)
+			TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)",
+				smapi_retcode[i].msg, inEBX);
+
+		if (ret != -EBUSY)
+			return ret;
+	}
+	return ret;
+}
+
+/* Convenience wrapper: discard output arguments */
+static int smapi_write(u32 inEBX, u32 inECX,
+		       u32 inEDI, u32 inESI, const char **msg)
+{
+	return smapi_request(inEBX, inECX, inEDI, inESI,
+			     NULL, NULL, NULL, NULL, NULL, msg);
+}
+
+
+/*********************************************************************
+ * Specific SMAPI services
+ * All of these functions return 0 upon success, and a negative errno
+ * constant (see smapi_retcode) on failure.
+ */
+
+enum thresh_type {
+	THRESH_STOP  = 0, /* the code assumes this is 0 for brevity */
+	THRESH_START
+};
+#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop")
+
+/**
+ * __get_real_thresh - read battery charge start/stop threshold from SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (pass this as-is to SMAPI)
+ * @outEDI: some additional state that needs to be preserved, meaning unknown
+ * @outESI: some additional state that needs to be preserved, meaning unknown
+ */
+static int __get_real_thresh(int bat, enum thresh_type which, int *thresh,
+			     u32 *outEDI, u32 *outESI)
+{
+	u32 ebx = (which == THRESH_START) ? SMAPI_GET_THRESH_START
+					  : SMAPI_GET_THRESH_STOP;
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(ebx, ecx, 0, 0, NULL,
+				&ecx, NULL, outEDI, outESI, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s",
+			THRESH_NAME(which), bat, msg);
+		return ret;
+	}
+	if (!(ecx&0x00000100)) {
+		TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x",
+			THRESH_NAME(which), bat, ecx);
+		return -EIO;
+	}
+	if (thresh)
+		*thresh = ecx&0xFF;
+	return 0;
+}
+
+/**
+ * get_real_thresh - read battery charge start/stop threshold from SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
+ */
+static int get_real_thresh(int bat, enum thresh_type which, int *thresh)
+{
+	return __get_real_thresh(bat, which, thresh, NULL, NULL);
+}
+
+/**
+ * set_real_thresh - write battery start/stop charge threshold to SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
+ */
+static int set_real_thresh(int bat, enum thresh_type which, int thresh)
+{
+	u32 ebx = (which == THRESH_START) ? SMAPI_SET_THRESH_START
+					  : SMAPI_SET_THRESH_STOP;
+	u32 ecx = ((bat+1)<<8) + thresh;
+	u32 getDI, getSI;
+	const char *msg;
+	int ret;
+
+	/* verify read before writing */
+	ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI);
+	if (ret)
+		return ret;
+
+	ret = smapi_write(ebx, ecx, getDI, getSI, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s",
+			THRESH_NAME(which), thresh, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set %s to %d for bat=%d",
+			THRESH_NAME(which), thresh, bat);
+	return ret;
+}
+
+/**
+ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ * @outECX: some additional state that needs to be preserved, meaning unknown
+ * Note that @minutes is the originally set value; it does not count down.
+ */
+static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX)
+{
+	u32 ecx = (bat+1)<<8;
+	u32 esi;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, &esi, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	if (!(ecx&0x0100)) {
+		TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat);
+		return -EIO;
+	}
+	if (minutes)
+		*minutes = (ecx&0x0001)?esi:0;
+	if (outECX)
+		*outECX = ecx;
+	return 0;
+}
+
+/**
+ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ * Note that @minutes is the originally set value; it does not count down.
+ */
+static int get_inhibit_charge_minutes(int bat, int *minutes)
+{
+	return __get_inhibit_charge_minutes(bat, minutes, NULL);
+}
+
+/**
+ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ */
+static int set_inhibit_charge_minutes(int bat, int minutes)
+{
+	u32 ecx;
+	const char *msg;
+	int ret;
+
+	/* verify read before writing */
+	ret = __get_inhibit_charge_minutes(bat, NULL, &ecx);
+	if (ret)
+		return ret;
+
+	ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 0x0001 : 0x0000);
+	if (minutes > 0xFFFF)
+		minutes = 0xFFFF;
+	ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE,
+			"set to %d failed for bat=%d: %s", minutes, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat);
+	return ret;
+}
+
+
+/**
+ * get_force_discharge - get status of forced discharging from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @enabled: 1 if forced discharge is enabled, 0 if not
+ */
+static int get_force_discharge(int bat, int *enabled)
+{
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, NULL, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	*enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0;
+	return 0;
+}
+
+/**
+ * set_force_discharge - write status of forced discharging to SMAPI
+ * @bat:     battery number (0 or 1)
+ * @enabled: 1 if forced discharge is enabled, 0 if not
+ */
+static int set_force_discharge(int bat, int enabled)
+{
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, NULL, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	if (ecx&0x00000100) {
+		TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat);
+		return -EIO;
+	}
+
+	ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0);
+	ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s",
+			enabled, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat);
+	return ret;
+}
+
+
+/*********************************************************************
+ * Wrappers to threshold-related SMAPI functions, which handle default
+ * thresholds and related quirks.
+ */
+
+/* Minimum, default and minimum difference for battery charging thresholds: */
+#define MIN_THRESH_DELTA      4  /* Min delta between start and stop thresh */
+#define MIN_THRESH_START      2
+#define MAX_THRESH_START      (100-MIN_THRESH_DELTA)
+#define MIN_THRESH_STOP       (MIN_THRESH_START + MIN_THRESH_DELTA)
+#define MAX_THRESH_STOP       100
+#define DEFAULT_THRESH_START  MAX_THRESH_START
+#define DEFAULT_THRESH_STOP   MAX_THRESH_STOP
+
+/* The GUI of IBM's Battery Maximizer seems to show a start threshold that
+ * is 1 more than the value we set/get via SMAPI. Since the threshold is
+ * maintained across reboots, this can be confusing. So we kludge our
+ * interface for interoperability: */
+#define BATMAX_FIX   1
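+
+/* Worked example: writing 96 to start_charge_thresh stores 95 via SMAPI
+ * (set_thresh() subtracts BATMAX_FIX), and reading adds it back so the
+ * user sees 96 again, which is what the Battery Maximizer GUI seems to
+ * display for that setting. */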
+
+/* Get charge start/stop threshold (1..100),
+ * substituting default values if needed and applying BATMAX_FIX. */
+static int get_thresh(int bat, enum thresh_type which, int *thresh)
+{
+	int ret = get_real_thresh(bat, which, thresh);
+	if (ret)
+		return ret;
+	if (*thresh == 0)
+		*thresh = (which == THRESH_START) ? DEFAULT_THRESH_START
+						  : DEFAULT_THRESH_STOP;
+	else if (which == THRESH_START)
+		*thresh += BATMAX_FIX;
+	return 0;
+}
+
+
+/* Set charge start/stop threshold (1..100),
+ * substituting default values if needed and applying BATMAX_FIX. */
+static int set_thresh(int bat, enum thresh_type which, int thresh)
+{
+	if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP)
+		thresh = 0; /* 100 is out of range, but default means 100 */
+	if (which == THRESH_START)
+		thresh -= BATMAX_FIX;
+	return set_real_thresh(bat, which, thresh);
+}
+
+/*********************************************************************
+ * ThinkPad embedded controller readout and basic functions
+ */
+
+/**
+ * read_tp_ec_row - read data row from the ThinkPad embedded controller
+ * @arg0: EC command code
+ * @bat: battery number, 0 or 1
+ * @j: the byte value to be used for "junk" (unused) input/outputs
+ * @dataval: result vector
+ */
+static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval)
+{
+	int ret;
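+	/* Argument row layout used throughout this driver: byte 0 carries
+	 * the EC command code, byte 15 selects the battery, and the bytes in
+	 * between are filled with the caller-chosen junk value @j (which
+	 * show_battery_dump() uses to spot outputs the EC never touches). */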
+	const struct thinkpad_ec_row args = { .mask = 0xFFFF,
+		.val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} };
+	struct thinkpad_ec_row data = { .mask = 0xFFFF };
+
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
+	ret = thinkpad_ec_read_row(&args, &data);
+	thinkpad_ec_unlock();
+	memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN);
+	return ret;
+}
+
+/**
+ * power_device_present - check for presence of battery or AC power
+ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power
+ * Returns 1 if present, 0 if not present, negative if error.
+ */
+static int power_device_present(int bat)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	u8 test;
+	int ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	switch (bat) {
+	case 0:  test = 0x40; break; /* battery 0 */
+	case 1:  test = 0x20; break; /* battery 1 */
+	default: test = 0x80;        /* AC power */
+	}
+	return (row[0] & test) ? 1 : 0;
+}
+
+/**
+ * bat_has_status - check if battery can report detailed status
+ * @bat: 0 for battery 0, 1 for battery 1
+ * Returns 1 if yes, 0 if no, negative if error.
+ */
+static int bat_has_status(int bat)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */
+		return 0;
+	if ((row[1] & (0x60)) == 0) /* no status */
+		return 0;
+	return 1;
+}
+
+/**
+ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data
+ * @arg0: first argument to EC
+ * @offset: byte offset in the row returned from the EC
+ * @bat: battery (0 or 1)
+ * @val: the 16-bit value obtained
+ * Returns nonzero on error.
+ */
+static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret;
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+	*val = *(u16 *)(row+offset);
+	return 0;
+}
+
+/*********************************************************************
+ * sysfs attributes for batteries -
+ * definitions and helper functions
+ */
+
+/* A custom device attribute struct which holds a battery number */
+struct bat_device_attribute {
+	struct device_attribute dev_attr;
+	int bat;
+};
+
+/**
+ * attr_get_bat - get the battery to which the attribute belongs
+ */
+static int attr_get_bat(struct device_attribute *attr)
+{
+	return container_of(attr, struct bat_device_attribute, dev_attr)->bat;
+}
+
+/**
+ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute
+ * @arg0: first argument of the EC row to read (command code)
+ * @offset: byte offset in EC raw data
+ * @mul: correction factor to multiply by
+ * @na_msg: string to output if the value is not available (0xFFFF)
+ * @attr: battery attribute
+ * @buf: output buffer
+ * The 16-bit value is read from the EC, treated as unsigned,
+ * transformed as x->mul*x, and printed to the buffer.
+ * If the value is 0xFFFF and na_msg!=%NULL, na_msg is printed instead.
+ */
+static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul,
+			      const char *na_msg,
+			      struct device_attribute *attr, char *buf)
+{
+	u16 val;
+	int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val);
+	if (ret)
+		return ret;
+	if (na_msg && val == 0xFFFF)
+		return sprintf(buf, "%s\n", na_msg);
+	else
+		return sprintf(buf, "%u\n", mul*(unsigned int)val);
+}
+
+/**
+ * show_tp_ec_bat_s16 - show a signed 16-bit battery attribute
+ * @arg0: first argument of the EC row to read (command code)
+ * @offset: byte offset in EC raw data
+ * @mul: correction factor to multiply by
+ * @add: correction term to add after multiplication
+ * @attr: battery attribute
+ * @buf: output buffer
+ * The 16-bit value is read from the EC, treated as signed,
+ * transformed as x->mul*x+add, and printed to the buffer.
+ */
+static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add,
+			      struct device_attribute *attr, char *buf)
+{
+	u16 val;
+	int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", mul*(s16)val+add);
+}
+
+/**
+ * show_tp_ec_bat_str - show a string from EC battery status data
+ * @arg0: first argument of the EC row to read (command code)
+ * @offset: byte offset in EC raw data
+ * @maxlen: maximum string length
+ * @attr: battery attribute
+ * @buf: output buffer
+ */
+static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen,
+			      struct device_attribute *attr, char *buf)
+{
+	int bat = attr_get_bat(attr);
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret;
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+	strncpy(buf, (char *)row+offset, maxlen);
+	buf[maxlen] = 0;
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+/**
+ * show_tp_ec_bat_power - show a power readout from EC battery status data
+ * @arg0: first argument of the EC row to read (command code)
+ * @offV: byte offset of voltage in EC raw data
+ * @offI: byte offset of current in EC raw data
+ * @attr: battery attribute
+ * @buf: output buffer
+ * Computes the power as current*voltage from the two given readout offsets.
+ */
+static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI,
+				struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int milliamp, millivolt, ret;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	millivolt = *(u16 *)(row+offV);
+	milliamp = *(s16 *)(row+offI);
+	return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */
+}
+
+/**
+ * show_tp_ec_bat_date - decode and show a date from EC battery status data
+ * @arg0: first argument of the EC row to read (command code)
+ * @offset: byte offset in EC raw data
+ * @attr: battery attribute
+ * @buf: output buffer
+ */
+static ssize_t show_tp_ec_bat_date(u8 arg0, int offset,
+			       struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	u16 v;
+	int ret;
+	int day, month, year;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+
+	/* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */
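+	/* e.g. 0x34A1 = 1 | (5<<5) | (26<<9) decodes to 2006-05-01 */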
+	v = *(u16 *)(row+offset);
+	day = v & 0x1F;
+	month = (v >> 5) & 0xF;
+	year = (v >> 9) + 1980;
+
+	return sprintf(buf, "%04d-%02d-%02d\n", year, month, day);
+}
+
+
+/*********************************************************************
+ * sysfs attribute I/O for batteries -
+ * the actual attribute show/store functions
+ */
+
+static ssize_t show_battery_start_charge_thresh(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int thresh;
+	int bat = attr_get_bat(attr);
+	int ret = get_thresh(bat, THRESH_START, &thresh);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", thresh);  /* units: percent */
+}
+
+static ssize_t show_battery_stop_charge_thresh(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int thresh;
+	int bat = attr_get_bat(attr);
+	int ret = get_thresh(bat, THRESH_STOP, &thresh);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", thresh);  /* units: percent */
+}
+
+/**
+ * store_battery_start_charge_thresh - store battery_start_charge_thresh attr
+ * Since this is a kernel<->user interface, we ensure a valid state for
+ * the hardware. We do this by clamping the requested threshold to the
+ * valid range and, if necessary, moving the other threshold so that
+ * it's at least MIN_THRESH_DELTA away from this one.
+ */
+static ssize_t store_battery_start_charge_thresh(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int thresh, other_thresh, ret;
+	int bat = attr_get_bat(attr);
+
+	if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100)
+		return -EINVAL;
+
+	if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */
+		thresh = MIN_THRESH_START;
+	if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */
+		thresh = MAX_THRESH_START;
+
+	down(&smapi_mutex);
+	ret = get_thresh(bat, THRESH_STOP, &other_thresh);
+	if (ret != -EOPNOTSUPP && ret != -ENXIO) {
+		if (ret) /* error reading the other threshold */
+			goto out;
+		ret = get_real_thresh(bat, THRESH_START, NULL);
+		if (ret) /* error reading this threshold */
+			goto out;
+		if (other_thresh < thresh+MIN_THRESH_DELTA) {
+			/* move other thresh to keep it above this one */
+			ret = set_thresh(bat, THRESH_STOP,
+					 thresh+MIN_THRESH_DELTA);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = set_thresh(bat, THRESH_START, thresh);
+out:
+	up(&smapi_mutex);
+	return (ret == 0) ? count : ret;
+}
+
+/**
+ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr
+ * Since this is a kernel<->user interface, we ensure a valid state for
+ * the hardware. We do this by clamping the requested threshold to the
+ * valid range and, if necessary, moving the other threshold so that
+ * it's at least MIN_THRESH_DELTA away from this one.
+ */
+static ssize_t store_battery_stop_charge_thresh(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int thresh, other_thresh, ret;
+	int bat = attr_get_bat(attr);
+
+	if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100)
+		return -EINVAL;
+
+	if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */
+		thresh = MIN_THRESH_STOP;
+
+	down(&smapi_mutex);
+	ret = get_thresh(bat, THRESH_START, &other_thresh);
+	if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */
+		if (ret)
+			goto out;
+		/* this threshold exists? */
+		ret = get_real_thresh(bat, THRESH_STOP, NULL);
+		if (ret)
+			goto out;
+		if (other_thresh >= thresh-MIN_THRESH_DELTA) {
+			 /* move other thresh to be below this one */
+			ret = set_thresh(bat, THRESH_START,
+					 thresh-MIN_THRESH_DELTA);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = set_thresh(bat, THRESH_STOP, thresh);
+out:
+	up(&smapi_mutex);
+	return (ret == 0) ? count : ret;
+}
+
+static ssize_t show_battery_inhibit_charge_minutes(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int minutes;
+	int bat = attr_get_bat(attr);
+	int ret = get_inhibit_charge_minutes(bat, &minutes);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", minutes);  /* units: minutes */
+}
+
+static ssize_t store_battery_inhibit_charge_minutes(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	int ret;
+	int minutes;
+	int bat = attr_get_bat(attr);
+	if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) {
+		TPRINTK(KERN_ERR, "inhibit_charge_minutes: "
+			      "must be a non-negative integer");
+		return -EINVAL;
+	}
+	ret = set_inhibit_charge_minutes(bat, minutes);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t show_battery_force_discharge(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int enabled;
+	int bat = attr_get_bat(attr);
+	int ret = get_force_discharge(bat, &enabled);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", enabled);  /* type: boolean */
+}
+
+static ssize_t store_battery_force_discharge(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int ret;
+	int enabled;
+	int bat = attr_get_bat(attr);
+	if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1)
+		return -EINVAL;
+	ret = set_force_discharge(bat, enabled);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t show_battery_installed(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int bat = attr_get_bat(attr);
+	int ret = power_device_present(bat);
+	if (ret < 0)
+		return ret;
+	return sprintf(buf, "%d\n", ret); /* type: boolean */
+}
+
+static ssize_t show_battery_state(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	const char *txt;
+	int ret;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return sprintf(buf, "none\n");
+	ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	switch (row[1] & 0xf0) {
+	case 0xc0: txt = "idle"; break;
+	case 0xd0: txt = "discharging"; break;
+	case 0xe0: txt = "charging"; break;
+	default:   return sprintf(buf, "unknown (0x%x)\n", row[1]);
+	}
+	return sprintf(buf, "%s\n", txt);  /* type: string from fixed set */
+}
+
+static ssize_t show_battery_manufacturer(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34: ManufacturerName() */
+	return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_model(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34: DeviceName() */
+	return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_barcoding(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string */
+	return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_chemistry(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */
+	return show_tp_ec_bat_str(6, 2, 5, attr, buf);
+}
+
+static ssize_t show_battery_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p24: Voltage() */
+	return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_design_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p32: DesignVoltage() */
+	return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_charging_max_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */
+	return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group0_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group1_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group2_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group3_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_current_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p24: Current() */
+	return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_current_avg(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p24: AverageCurrent() */
+	return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_charging_max_current(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */
+	return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_power_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mW. SBS spec v1.1: Voltage()*Current() */
+	return show_tp_ec_bat_power(1, 6, 8, attr, buf);
+}
+
+static ssize_t show_battery_power_avg(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */
+	return show_tp_ec_bat_power(1, 6, 10, attr, buf);
+}
+
+static ssize_t show_battery_remaining_percent(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: percent. SBS spec v1.1 p25: RelativeStateOfCharge() */
+	return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_remaining_percent_error(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: percent. SBS spec v1.1 p25: MaxError() */
+	return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_remaining_charging_time(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */
+	return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_running_time(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */
+	return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_running_time_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */
+	return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p26. */
+	return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_last_full_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */
+	return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_design_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p32: DesignCapacity() */
+	return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_cycle_count(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: ordinal. SBS spec v1.1 p32: CycleCount() */
+	return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf);
+}
+
+static ssize_t show_battery_temperature(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: millicelsius. SBS spec v1.1: Temperature()*10 */
+	return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf);
+}
+
+static ssize_t show_battery_serial(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: int. SBS spec v1.1 p34: SerialNumber() */
+	return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf);
+}
+
+static ssize_t show_battery_manufacture_date(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */
+	return show_tp_ec_bat_date(3, 8, attr, buf);
+}
+
+static ssize_t show_battery_first_use_date(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: YYYY-MM-DD */
+	return show_tp_ec_bat_date(8, 2, attr, buf);
+}
+
+/**
+ * show_battery_dump - show the battery's dump attribute
+ * The dump attribute gives a hex dump of all EC readouts related to a
+ * battery. Some of the enumerated values don't really exist (i.e., the
+ * EC function just leaves them untouched); we use a kludge to detect and
+ * denote these.
+ */
+#define MIN_DUMP_ARG0 0x00
+#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */
+static ssize_t show_battery_dump(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int i;
+	char *p = buf;
+	int bat = attr_get_bat(attr);
+	u8 arg0; /* first argument to EC */
+	u8 rowa[TP_CONTROLLER_ROW_LEN],
+	   rowb[TP_CONTROLLER_ROW_LEN];
+	const u8 junka = 0xAA,
+		 junkb = 0x55; /* junk values for testing changes */
+	int ret;
+
+	for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) {
+		if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5)
+			return -ENOMEM; /* don't overflow sysfs buf */
+		/* Read raw twice with different junk values,
+		 * to detect unused output bytes which are left unchanged: */
+		ret = read_tp_ec_row(arg0, bat, junka, rowa);
+		if (ret)
+			return ret;
+		ret = read_tp_ec_row(arg0, bat, junkb, rowb);
+		if (ret)
+			return ret;
+		for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) {
+			if (rowa[i] == junka && rowb[i] == junkb)
+				p += sprintf(p, "-- "); /* unused by EC */
+			else
+				p += sprintf(p, "%02x ", rowa[i]);
+		}
+		p += sprintf(p, "\n");
+	}
+	return p-buf;
+}
+
+
+/*********************************************************************
+ * sysfs attribute I/O, other than batteries
+ */
+
+static ssize_t show_ac_connected(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int ret = power_device_present(0xFF);
+	if (ret < 0)
+		return ret;
+	return sprintf(buf, "%d\n", ret);  /* type: boolean */
+}
+
+/*********************************************************************
+ * The "smapi_request" sysfs attribute executes a raw SMAPI call.
+ * You write to make a request and read to get the result. The state
+ * is saved globally rather than per fd (sysfs limitation), so
+ * simultaneous requests may get each other's results! So this is for
+ * development and debugging only.
+ */
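+/* Illustrative (untested) usage from userspace, matching the sscanf and
+ * snprintf formats below; the input is "EBX ECX EDI ESI" in hex, the
+ * output is "EBX ECX EDX EDI ESI ret 'msg'":
+ *   # echo "2116 100 0 0" > /sys/devices/platform/smapi/smapi_request
+ *   # cat /sys/devices/platform/smapi/smapi_request
+ * (0x2116 = SMAPI_GET_THRESH_START, ECX=0x100 selects battery 0.) */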
+#define MAX_SMAPI_ATTR_ANSWER_LEN   128
+static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = "";
+
+static ssize_t show_smapi_request(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer);
+	smapi_attr_answer[0] = '\0';
+	return ret;
+}
+
+static ssize_t store_smapi_request(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned int inEBX, inECX, inEDI, inESI;
+	u32 outEBX, outECX, outEDX, outEDI, outESI;
+	const char *msg;
+	int ret;
+	if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) {
+		smapi_attr_answer[0] = '\0';
+		return -EINVAL;
+	}
+	ret = smapi_request(
+		   inEBX, inECX, inEDI, inESI,
+		   &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg);
+	snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN,
+		 "%x %x %x %x %x %d '%s'\n",
+		 (unsigned int)outEBX, (unsigned int)outECX,
+		 (unsigned int)outEDX, (unsigned int)outEDI,
+		 (unsigned int)outESI, ret, msg);
+	if (ret)
+		return ret;
+	else
+		return count;
+}
+
+/*********************************************************************
+ * Power management: the embedded controller forgets the battery
+ * thresholds when the system is suspended to disk and unplugged from
+ * AC and battery, so we restore them upon resume.
+ */
+
+static int saved_threshs[4] = {-1, -1, -1, -1};  /* -1 = don't know */
+
+static int tp_suspend(struct platform_device *dev, pm_message_t state)
+{
+	int restore = (state.event == PM_EVENT_HIBERNATE ||
+	               state.event == PM_EVENT_FREEZE);
+	if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0]))
+		saved_threshs[0] = -1;
+	if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1]))
+		saved_threshs[1] = -1;
+	if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2]))
+		saved_threshs[2] = -1;
+	if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3]))
+		saved_threshs[3] = -1;
+	DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0],
+		saved_threshs[1], saved_threshs[2], saved_threshs[3]);
+	return 0;
+}
+
+static int tp_resume(struct platform_device *dev)
+{
+	DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0],
+		saved_threshs[1], saved_threshs[2], saved_threshs[3]);
+	if (saved_threshs[0] >= 0)
+		set_real_thresh(0, THRESH_STOP , saved_threshs[0]);
+	if (saved_threshs[1] >= 0)
+		set_real_thresh(0, THRESH_START, saved_threshs[1]);
+	if (saved_threshs[2] >= 0)
+		set_real_thresh(1, THRESH_STOP , saved_threshs[2]);
+	if (saved_threshs[3] >= 0)
+		set_real_thresh(1, THRESH_START, saved_threshs[3]);
+	return 0;
+}
+
+
+/*********************************************************************
+ * Driver model
+ */
+
+static struct platform_driver tp_driver = {
+	.suspend = tp_suspend,
+	.resume = tp_resume,
+	.driver = {
+		.name = "smapi",
+		.owner = THIS_MODULE
+	},
+};
+
+
+/*********************************************************************
+ * Sysfs device model
+ */
+
+/* Attributes in /sys/devices/platform/smapi/ */
+
+static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL);
+static DEVICE_ATTR(smapi_request, 0600, show_smapi_request,
+					store_smapi_request);
+
+static struct attribute *tp_root_attributes[] = {
+	&dev_attr_ac_connected.attr,
+	&dev_attr_smapi_request.attr,
+	NULL
+};
+static struct attribute_group tp_root_attribute_group = {
+	.attrs = tp_root_attributes
+};
+
+/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ :
+ * Every attribute needs to be defined (i.e., statically allocated) for
+ * each battery, and then referenced in the attribute list of each battery.
+ * We use preprocessor voodoo to avoid duplicating the list of attributes 4
+ * times. The preprocessor output is just normal sysfs attributes code.
+ */
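+
+/* For reference, DEFINE_BAT_ATTR_RW(0, start_charge_thresh) below expands
+ * roughly to:
+ *   static struct bat_device_attribute dev_attr_start_charge_thresh_0 = {
+ *     .dev_attr = __ATTR(start_charge_thresh, 0644,
+ *                        show_battery_start_charge_thresh,
+ *                        store_battery_start_charge_thresh),
+ *     .bat = 0
+ *   };
+ * and REF_BAT_ATTR(0, start_charge_thresh) yields the corresponding
+ * &dev_attr_start_charge_thresh_0.dev_attr.attr array entry. */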
+
+/**
+ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes
+ * @_BAT:     battery number (0 or 1)
+ * @_ATTR_RW: macro to invoke for each read/write attribute
+ * @_ATTR_R:  macro to invoke for each read-only  attribute
+ */
+#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \
+	_ATTR_RW(_BAT, start_charge_thresh) \
+	_ATTR_RW(_BAT, stop_charge_thresh) \
+	_ATTR_RW(_BAT, inhibit_charge_minutes) \
+	_ATTR_RW(_BAT, force_discharge) \
+	_ATTR_R(_BAT, installed) \
+	_ATTR_R(_BAT, state) \
+	_ATTR_R(_BAT, manufacturer) \
+	_ATTR_R(_BAT, model) \
+	_ATTR_R(_BAT, barcoding) \
+	_ATTR_R(_BAT, chemistry) \
+	_ATTR_R(_BAT, voltage) \
+	_ATTR_R(_BAT, group0_voltage) \
+	_ATTR_R(_BAT, group1_voltage) \
+	_ATTR_R(_BAT, group2_voltage) \
+	_ATTR_R(_BAT, group3_voltage) \
+	_ATTR_R(_BAT, current_now) \
+	_ATTR_R(_BAT, current_avg) \
+	_ATTR_R(_BAT, charging_max_current) \
+	_ATTR_R(_BAT, power_now) \
+	_ATTR_R(_BAT, power_avg) \
+	_ATTR_R(_BAT, remaining_percent) \
+	_ATTR_R(_BAT, remaining_percent_error) \
+	_ATTR_R(_BAT, remaining_charging_time) \
+	_ATTR_R(_BAT, remaining_running_time) \
+	_ATTR_R(_BAT, remaining_running_time_now) \
+	_ATTR_R(_BAT, remaining_capacity) \
+	_ATTR_R(_BAT, last_full_capacity) \
+	_ATTR_R(_BAT, design_voltage) \
+	_ATTR_R(_BAT, charging_max_voltage) \
+	_ATTR_R(_BAT, design_capacity) \
+	_ATTR_R(_BAT, cycle_count) \
+	_ATTR_R(_BAT, temperature) \
+	_ATTR_R(_BAT, serial) \
+	_ATTR_R(_BAT, manufacture_date) \
+	_ATTR_R(_BAT, first_use_date) \
+	_ATTR_R(_BAT, dump)
+
+/* Define several macros we will feed into FOREACH_BAT_ATTR: */
+
+#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \
+	static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = {  \
+		.dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME,   \
+						store_battery_##_NAME), \
+		.bat = _BAT \
+	};
+
+#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \
+	static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = {    \
+		.dev_attr = __ATTR(_NAME, 0444, show_battery_##_NAME, 0), \
+		.bat = _BAT \
+	};
+
+#define REF_BAT_ATTR(_BAT,_NAME) \
+	&dev_attr_##_NAME##_##_BAT.dev_attr.attr,
+
+/* This provides all attributes for one battery: */
+
+#define PROVIDE_BAT_ATTRS(_BAT) \
+	FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \
+	static struct attribute *tp_bat##_BAT##_attributes[] = { \
+		FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \
+		NULL \
+	}; \
+	static struct attribute_group tp_bat##_BAT##_attribute_group = { \
+		.name  = "BAT" #_BAT, \
+		.attrs = tp_bat##_BAT##_attributes \
+	};
+
+/* Finally generate the attributes: */
+
+PROVIDE_BAT_ATTRS(0)
+PROVIDE_BAT_ATTRS(1)
+
+/* List of attribute groups */
+
+static struct attribute_group *attr_groups[] = {
+	&tp_root_attribute_group,
+	&tp_bat0_attribute_group,
+	&tp_bat1_attribute_group,
+	NULL
+};
+
+
+/*********************************************************************
+ * Init and cleanup
+ */
+
+static struct attribute_group **next_attr_group; /* next to register */
+
+static int __init tp_init(void)
+{
+	int ret;
+	printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n");
+
+	ret = find_smapi_port();
+	if (ret < 0)
+		goto err;
+	else
+		smapi_port = ret;
+
+	if (!request_region(smapi_port, 1, "smapi")) {
+		printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
+		       smapi_port);
+		ret = -ENXIO;
+		goto err;
+	}
+
+	if (!request_region(SMAPI_PORT2, 1, "smapi")) {
+		printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
+		       SMAPI_PORT2);
+		ret = -ENXIO;
+		goto err_port1;
+	}
+
+	ret = platform_driver_register(&tp_driver);
+	if (ret)
+		goto err_port2;
+
+	pdev = platform_device_alloc("smapi", -1);
+	if (!pdev) {
+		ret = -ENOMEM;
+		goto err_driver;
+	}
+
+	ret = platform_device_add(pdev);
+	if (ret)
+		goto err_device_free;
+
+	for (next_attr_group = attr_groups; *next_attr_group;
+	     ++next_attr_group) {
+		ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group);
+		if (ret)
+			goto err_attr;
+	}
+
+	printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n",
+	       smapi_port);
+	return 0;
+
+err_attr:
+	while (--next_attr_group >= attr_groups)
+		sysfs_remove_group(&pdev->dev.kobj, *next_attr_group);
+	platform_device_unregister(pdev);
+err_device_free:
+	platform_device_put(pdev);
+err_driver:
+	platform_driver_unregister(&tp_driver);
+err_port2:
+	release_region(SMAPI_PORT2, 1);
+err_port1:
+	release_region(smapi_port, 1);
+err:
+	printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret);
+	return ret;
+}
+
+static void __exit tp_exit(void)
+{
+	while (next_attr_group && --next_attr_group >= attr_groups)
+		sysfs_remove_group(&pdev->dev.kobj, *next_attr_group);
+	platform_device_unregister(pdev);
+	platform_driver_unregister(&tp_driver);
+	release_region(SMAPI_PORT2, 1);
+	if (smapi_port)
+		release_region(smapi_port, 1);
+
+	printk(KERN_INFO "tp_smapi unloaded.\n");
+}
+
+module_init(tp_init);
+module_exit(tp_exit);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 5d3b86a..7f27a43 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -110,4 +110,6 @@ source "drivers/staging/wilc1000/Kconfig"
 
 source "drivers/staging/most/Kconfig"
 
+source "drivers/staging/vhba/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 30918ed..3d47777 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_FB_SM750)		+= sm750fb/
 obj-$(CONFIG_FB_XGI)		+= xgifb/
+obj-$(CONFIG_VHBA)		+= vhba/
 obj-$(CONFIG_USB_EMXX)		+= emxx_udc/
 obj-$(CONFIG_SPEAKUP)		+= speakup/
 obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4)	+= ste_rmi4/
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 5bb1283..f4f8f03 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -387,7 +387,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
 			name = asma->name;
 
 		/* ... and allocate the backing shmem file */
-		vmfile = shmem_file_setup(name, asma->size, vma->vm_flags);
+		vmfile = shmem_file_setup(name, asma->size, vma->vm_flags, 0);
 		if (IS_ERR(vmfile)) {
 			ret = PTR_ERR(vmfile);
 			goto out;
diff --git b/drivers/staging/vhba/Kconfig b/drivers/staging/vhba/Kconfig
new file mode 100644
index 0000000..7ccb7d8
--- /dev/null
+++ b/drivers/staging/vhba/Kconfig
@@ -0,0 +1,9 @@
+config VHBA
+	tristate "Virtual (SCSI) Host Bus Adapter"
+	depends on SCSI
+	---help---
+	This is the in-kernel part of CDEmu, a CD/DVD-ROM device
+	emulator.
+
+	This driver can also be built as a module. If so, the module
+	will be called vhba.
diff --git b/drivers/staging/vhba/Makefile b/drivers/staging/vhba/Makefile
new file mode 100644
index 0000000..60b9e26
--- /dev/null
+++ b/drivers/staging/vhba/Makefile
@@ -0,0 +1,4 @@
+VHBA_VERSION := 20140928
+
+obj-$(CONFIG_VHBA)		+= vhba.o
+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror
diff --git b/drivers/staging/vhba/vhba.c b/drivers/staging/vhba/vhba.c
new file mode 100644
index 0000000..4a14a10
--- /dev/null
+++ b/drivers/staging/vhba/vhba.c
@@ -0,0 +1,1071 @@
+/*
+ * vhba.c
+ *
+ * Copyright (C) 2007-2012 Chia-I Wu <b90201047 AT ntu DOT edu DOT tw>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/version.h>
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <asm/uaccess.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+
+/* scatterlist.page_link and sg_page() were introduced in 2.6.24 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
+#define USE_SG_PAGE
+#include <linux/scatterlist.h>
+#endif
+
+MODULE_AUTHOR("Chia-I Wu");
+MODULE_VERSION(VHBA_VERSION);
+MODULE_DESCRIPTION("Virtual SCSI HBA");
+MODULE_LICENSE("GPL");
+
+#ifdef DEBUG
+#define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __func__, ## args)
+#else
+#define DPRINTK(fmt, args...)
+#endif
+
+/* scmd_dbg was introduced in 3.15 */
+#ifndef scmd_dbg
+#define scmd_dbg(scmd, fmt, a...)       \
+    dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a)
+#endif
+
+#ifndef scmd_warn
+#define scmd_warn(scmd, fmt, a...)      \
+    dev_warn(&(scmd)->device->sdev_gendev, fmt, ##a)
+#endif
+
+#define VHBA_MAX_SECTORS_PER_IO 256
+#define VHBA_MAX_ID 32
+#define VHBA_CAN_QUEUE 32
+#define VHBA_INVALID_ID VHBA_MAX_ID
+
+#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+
+
+/* SCSI macros were introduced in 2.6.23 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)
+#define scsi_sg_count(cmd) ((cmd)->use_sg)
+#define scsi_sglist(cmd) ((cmd)->request_buffer)
+#define scsi_bufflen(cmd) ((cmd)->request_bufflen)
+#define scsi_set_resid(cmd, to_read) {(cmd)->resid = (to_read);}
+#endif
+
+/* 1-argument form of k[un]map_atomic was introduced in 2.6.37-rc1;
+   2-argument form was deprecated in 3.4-rc1 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 37)
+#define vhba_kmap_atomic kmap_atomic
+#define vhba_kunmap_atomic kunmap_atomic
+#else
+#define vhba_kmap_atomic(page) kmap_atomic(page, KM_USER0)
+#define vhba_kunmap_atomic(page) kunmap_atomic(page, KM_USER0)
+#endif
+
+
+enum vhba_req_state {
+    VHBA_REQ_FREE,
+    VHBA_REQ_PENDING,
+    VHBA_REQ_READING,
+    VHBA_REQ_SENT,
+    VHBA_REQ_WRITING,
+};
+
+struct vhba_command {
+    struct scsi_cmnd *cmd;
+    int status;
+    struct list_head entry;
+};
+
+struct vhba_device {
+    uint id;
+    spinlock_t cmd_lock;
+    struct list_head cmd_list;
+    wait_queue_head_t cmd_wq;
+    atomic_t refcnt;
+};
+
+struct vhba_host {
+    struct Scsi_Host *shost;
+    spinlock_t cmd_lock;
+    int cmd_next;
+    struct vhba_command commands[VHBA_CAN_QUEUE];
+    spinlock_t dev_lock;
+    struct vhba_device *devices[VHBA_MAX_ID];
+    int num_devices;
+    DECLARE_BITMAP(chgmap, VHBA_MAX_ID);
+    int chgtype[VHBA_MAX_ID];
+    struct work_struct scan_devices;
+};
+
+#define MAX_COMMAND_SIZE 16
+
+struct vhba_request {
+    __u32 tag;
+    __u32 lun;
+    __u8 cdb[MAX_COMMAND_SIZE];
+    __u8 cdb_len;
+    __u32 data_len;
+};
+
+struct vhba_response {
+    __u32 tag;
+    __u32 status;
+    __u32 data_len;
+};
+
+static struct vhba_command *vhba_alloc_command (void);
+static void vhba_free_command (struct vhba_command *vcmd);
+
+static struct platform_device vhba_platform_device;
+
+static struct vhba_device *vhba_device_alloc (void)
+{
+    struct vhba_device *vdev;
+
+    vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL);
+    if (!vdev) {
+        return NULL;
+    }
+
+    vdev->id = VHBA_INVALID_ID;
+    spin_lock_init(&vdev->cmd_lock);
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    init_waitqueue_head(&vdev->cmd_wq);
+    atomic_set(&vdev->refcnt, 1);
+
+    return vdev;
+}
+
+static void vhba_device_put (struct vhba_device *vdev)
+{
+    if (atomic_dec_and_test(&vdev->refcnt)) {
+        kfree(vdev);
+    }
+}
+
+static struct vhba_device *vhba_device_get (struct vhba_device *vdev)
+{
+    atomic_inc(&vdev->refcnt);
+
+    return vdev;
+}
+
+static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    vcmd = vhba_alloc_command();
+    if (!vcmd) {
+        return SCSI_MLQUEUE_HOST_BUSY;
+    }
+
+    vcmd->cmd = cmd;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_add_tail(&vcmd->entry, &vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    wake_up_interruptible(&vdev->cmd_wq);
+
+    return 0;
+}
+
+static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_command *vcmd;
+    int retval;
+    unsigned long flags;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->cmd == cmd) {
+            list_del_init(&vcmd->entry);
+            break;
+        }
+    }
+
+    /* command not found */
+    if (&vcmd->entry == &vdev->cmd_list) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        return SUCCESS;
+    }
+
+    while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        scmd_dbg(cmd, "wait for I/O before aborting\n");
+        schedule_timeout(1);
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    retval = (vcmd->status == VHBA_REQ_SENT) ? FAILED : SUCCESS;
+
+    vhba_free_command(vcmd);
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return retval;
+}
+
+static inline void vhba_scan_devices_add (struct vhba_host *vhost, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, 0, id, 0);
+    if (!sdev) {
+        scsi_add_device(vhost->shost, 0, id, 0);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device 0:%d:0!\n", id);
+        scsi_device_put(sdev);
+    }
+}
+
+static inline void vhba_scan_devices_remove (struct vhba_host *vhost, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, 0, id, 0);
+    if (sdev) {
+        scsi_remove_device(sdev);
+        scsi_device_put(sdev);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device 0:%d:0!\n", id);
+    }
+}
+
+static void vhba_scan_devices (struct work_struct *work)
+{
+    struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices);
+    unsigned long flags;
+    int id, change, exists;
+
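+    /* chgtype[id] is incremented by vhba_add_device() and decremented by
+       vhba_remove_device(), so its sign gives the net change since the
+       last scan; zero means a paired add/remove (or remove/add), which is
+       disambiguated below by whether the device slot is still populated. */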
+    while (1) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+
+        id = find_first_bit(vhost->chgmap, vhost->shost->max_id);
+        if (id >= vhost->shost->max_id) {
+            spin_unlock_irqrestore(&vhost->dev_lock, flags);
+            break;
+        }
+        change = vhost->chgtype[id];
+        exists = vhost->devices[id] != NULL;
+
+        vhost->chgtype[id] = 0;
+        clear_bit(id, vhost->chgmap);
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+        if (change < 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to remove target 0:%d:0\n", id);
+            vhba_scan_devices_remove(vhost, id);
+        } else if (change > 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to add target 0:%d:0\n", id);
+            vhba_scan_devices_add(vhost, id);
+        } else {
+            /* quick sequence of add/remove or remove/add; we determine
+               which one it was by checking whether the device structure
+               exists */
+            if (exists) {
+                /* remove followed by add: remove and (re)add */
+                dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target 0:%d:0\n", id);
+                vhba_scan_devices_remove(vhost, id);
+                vhba_scan_devices_add(vhost, id);
+            } else {
+                /* add followed by remove: no-op */
+                dev_dbg(&vhost->shost->shost_gendev, "no-op for target 0:%d:0\n", id);
+            }
+        }
+    }
+}
+
+static int vhba_add_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    int i;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    vhba_device_get(vdev);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    if (vhost->num_devices >= vhost->shost->max_id) {
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+        vhba_device_put(vdev);
+        return -EBUSY;
+    }
+
+    for (i = 0; i < vhost->shost->max_id; i++) {
+        if (vhost->devices[i] == NULL) {
+            vdev->id = i;
+            vhost->devices[i] = vdev;
+            vhost->num_devices++;
+            set_bit(vdev->id, vhost->chgmap);
+            vhost->chgtype[vdev->id]++;
+            break;
+        }
+    }
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static int vhba_remove_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    set_bit(vdev->id, vhost->chgmap);
+    vhost->chgtype[vdev->id]--;
+    vhost->devices[vdev->id] = NULL;
+    vhost->num_devices--;
+    vdev->id = VHBA_INVALID_ID;
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    vhba_device_put(vdev);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static struct vhba_device *vhba_lookup_device (int id)
+{
+    struct vhba_host *vhost;
+    struct vhba_device *vdev = NULL;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    if (likely(id < vhost->shost->max_id)) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+        vdev = vhost->devices[id];
+        if (vdev) {
+            vdev = vhba_device_get(vdev);
+        }
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+    }
+
+    return vdev;
+}
+
+static struct vhba_command *vhba_alloc_command (void)
+{
+    struct vhba_host *vhost;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+    int i;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+
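+    /* Hand out the slot after the last one used (cheap round-robin); if that
+     * slot is still busy, fall back to a linear scan of the whole table. */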
+    vcmd = vhost->commands + vhost->cmd_next++;
+    if (vcmd->status != VHBA_REQ_FREE) {
+        for (i = 0; i < vhost->shost->can_queue; i++) {
+            vcmd = vhost->commands + i;
+
+            if (vcmd->status == VHBA_REQ_FREE) {
+                vhost->cmd_next = i + 1;
+                break;
+            }
+        }
+
+        if (i == vhost->shost->can_queue) {
+            vcmd = NULL;
+        }
+    }
+
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    vhost->cmd_next %= vhost->shost->can_queue;
+
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+
+    return vcmd;
+}
+
+static void vhba_free_command (struct vhba_command *vcmd)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+    vcmd->status = VHBA_REQ_FREE;
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+}
+
+static int vhba_queuecommand_lck (struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
+{
+    struct vhba_device *vdev;
+    int retval;
+
+    scmd_dbg(cmd, "queue %lu\n", cmd->serial_number);
+
+    vdev = vhba_lookup_device(cmd->device->id);
+    if (!vdev) {
+        scmd_dbg(cmd, "no such device\n");
+
+        cmd->result = DID_NO_CONNECT << 16;
+        done(cmd);
+
+        return 0;
+    }
+
+    cmd->scsi_done = done;
+    retval = vhba_device_queue(vdev, cmd);
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+#ifdef DEF_SCSI_QCMD
+DEF_SCSI_QCMD(vhba_queuecommand)
+#else
+#define vhba_queuecommand vhba_queuecommand_lck
+#endif
+
+static int vhba_abort (struct scsi_cmnd *cmd)
+{
+    struct vhba_device *vdev;
+    int retval = SUCCESS;
+
+    scmd_warn(cmd, "abort %lu\n", cmd->serial_number);
+
+    vdev = vhba_lookup_device(cmd->device->id);
+    if (vdev) {
+        retval = vhba_device_dequeue(vdev, cmd);
+        vhba_device_put(vdev);
+    } else {
+        cmd->result = DID_NO_CONNECT << 16;
+    }
+
+    return retval;
+}
+
+static struct scsi_host_template vhba_template = {
+    .module = THIS_MODULE,
+    .name = "vhba",
+    .proc_name = "vhba",
+    .queuecommand = vhba_queuecommand,
+    .eh_abort_handler = vhba_abort,
+    .can_queue = VHBA_CAN_QUEUE,
+    .this_id = -1,
+    .cmd_per_lun = 1,
+    .max_sectors = VHBA_MAX_SECTORS_PER_IO,
+    .sg_tablesize = 256,
+};
+
+static ssize_t do_request (struct scsi_cmnd *cmd, char __user *buf, size_t buf_len)
+{
+    struct vhba_request vreq;
+    ssize_t ret;
+
+    scmd_dbg(cmd, "request %lu, cdb 0x%x, bufflen %d, use_sg %d\n",
+        cmd->serial_number, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd));
+
+    ret = sizeof(vreq);
+    if (DATA_TO_DEVICE(cmd->sc_data_direction)) {
+        ret += scsi_bufflen(cmd);
+    }
+
+    if (ret > buf_len) {
+        scmd_warn(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret);
+        return -EIO;
+    }
+
+    vreq.tag = cmd->serial_number;
+    vreq.lun = cmd->device->lun;
+    memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE);
+    vreq.cdb_len = cmd->cmd_len;
+    vreq.data_len = scsi_bufflen(cmd);
+
+    if (copy_to_user(buf, &vreq, sizeof(vreq))) {
+        return -EFAULT;
+    }
+
+    if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) {
+        buf += sizeof(vreq);
+
+        if (scsi_sg_count(cmd)) {
+            unsigned char buf_stack[64];
+            unsigned char *kaddr, *uaddr, *kbuf;
+            struct scatterlist *sg = scsi_sglist(cmd);
+            int i;
+
+            uaddr = (unsigned char *) buf;
+
+            if (vreq.data_len > 64) {
+                kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+            } else {
+                kbuf = buf_stack;
+            }
+
+            for (i = 0; i < scsi_sg_count(cmd); i++) {
+                size_t len = sg[i].length;
+
+#ifdef USE_SG_PAGE
+                kaddr = vhba_kmap_atomic(sg_page(&sg[i]));
+#else
+                kaddr = vhba_kmap_atomic(sg[i].page);
+#endif
+                memcpy(kbuf, kaddr + sg[i].offset, len);
+                vhba_kunmap_atomic(kaddr);
+
+                if (copy_to_user(uaddr, kbuf, len)) {
+                    if (kbuf != buf_stack) {
+                        kfree(kbuf);
+                    }
+                    return -EFAULT;
+                }
+                uaddr += len;
+            }
+
+            if (kbuf != buf_stack) {
+                kfree(kbuf);
+            }
+        } else {
+            if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) {
+                return -EFAULT;
+            }
+        }
+    }
+
+    return ret;
+}
+
+static ssize_t do_response (struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res)
+{
+    ssize_t ret = 0;
+
+    scmd_dbg(cmd, "response %lu, status %x, data len %d, use_sg %d\n",
+         cmd->serial_number, res->status, res->data_len, scsi_sg_count(cmd));
+
+    if (res->status) {
+        unsigned char sense_stack[SCSI_SENSE_BUFFERSIZE];
+
+        if (res->data_len > SCSI_SENSE_BUFFERSIZE) {
+            scmd_warn(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len);
+            res->data_len = SCSI_SENSE_BUFFERSIZE;
+        }
+
+        /* Copy via temporary buffer on stack in order to avoid problems
+           with PAX on grsecurity-enabled kernels */
+        if (copy_from_user(sense_stack, buf, res->data_len)) {
+            return -EFAULT;
+        }
+        memcpy(cmd->sense_buffer, sense_stack, res->data_len);
+
+        cmd->result = res->status;
+
+        ret += res->data_len;
+    } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) {
+        size_t to_read;
+
+        if (res->data_len > scsi_bufflen(cmd)) {
+            scmd_warn(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len);
+            res->data_len = scsi_bufflen(cmd);
+        }
+
+        to_read = res->data_len;
+
+        if (scsi_sg_count(cmd)) {
+            unsigned char buf_stack[64];
+            unsigned char *kaddr, *uaddr, *kbuf;
+            struct scatterlist *sg = scsi_sglist(cmd);
+            int i;
+
+            uaddr = (unsigned char *)buf;
+
+            if (res->data_len > 64) {
+                kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+            } else {
+                kbuf = buf_stack;
+            }
+
+            for (i = 0; i < scsi_sg_count(cmd); i++) {
+                size_t len = (sg[i].length < to_read) ? sg[i].length : to_read;
+
+                if (copy_from_user(kbuf, uaddr, len)) {
+                    if (kbuf != buf_stack) {
+                        kfree(kbuf);
+                    }
+                    return -EFAULT;
+                }
+                uaddr += len;
+
+#ifdef USE_SG_PAGE
+                kaddr = vhba_kmap_atomic(sg_page(&sg[i]));
+#else
+                kaddr = vhba_kmap_atomic(sg[i].page);
+#endif
+                memcpy(kaddr + sg[i].offset, kbuf, len);
+                vhba_kunmap_atomic(kaddr);
+
+                to_read -= len;
+                if (to_read == 0) {
+                    break;
+                }
+            }
+
+            if (kbuf != buf_stack) {
+                kfree(kbuf);
+            }
+        } else {
+            if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) {
+                return -EFAULT;
+            }
+
+            to_read -= res->data_len;
+        }
+
+        scsi_set_resid(cmd, to_read);
+
+        ret += res->data_len - to_read;
+    }
+
+    return ret;
+}
+
+static inline struct vhba_command *next_command (struct vhba_device *vdev)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->status == VHBA_REQ_PENDING) {
+            break;
+        }
+    }
+
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static inline struct vhba_command *match_command (struct vhba_device *vdev, u32 tag)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->cmd->serial_number == tag) {
+            break;
+        }
+    }
+
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags)
+{
+    struct vhba_command *vcmd;
+    DEFINE_WAIT(wait);
+
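+    /* Called with cmd_lock held; the lock is dropped around schedule() and
+     * re-acquired, so the caller still holds it when we return. */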
+    while (!(vcmd = next_command(vdev))) {
+        if (signal_pending(current)) {
+            break;
+        }
+
+        prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE);
+
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        schedule();
+
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    finish_wait(&vdev->cmd_wq, &wait);
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_READING;
+    }
+
+    return vcmd;
+}
+
+static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    ssize_t ret;
+    unsigned long flags;
+
+    vdev = file->private_data;
+
+    /* Get next command */
+    if (file->f_flags & O_NONBLOCK) {
+        /* Non-blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = next_command(vdev);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -EWOULDBLOCK;
+        }
+    } else {
+        /* Blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = wait_command(vdev, flags);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -ERESTARTSYS;
+        }
+    }
+
+    ret = do_request(vcmd->cmd, buf, buf_len);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+        vcmd->status = VHBA_REQ_SENT;
+        *offset += ret;
+    } else {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    struct vhba_response res;
+    ssize_t ret;
+    unsigned long flags;
+
+    if (buf_len < sizeof(res)) {
+        return -EIO;
+    }
+
+    if (copy_from_user(&res, buf, sizeof(res))) {
+        return -EFAULT;
+    }
+
+    vdev = file->private_data;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    vcmd = match_command(vdev, res.tag);
+    if (!vcmd || vcmd->status != VHBA_REQ_SENT) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        DPRINTK("not expecting response\n");
+        return -EIO;
+    }
+    vcmd->status = VHBA_REQ_WRITING;
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    ret = do_response(vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+        vcmd->cmd->scsi_done(vcmd->cmd);
+        ret += sizeof(res);
+
+        /* don't compete with vhba_device_dequeue */
+        if (!list_empty(&vcmd->entry)) {
+            list_del_init(&vcmd->entry);
+            vhba_free_command(vcmd);
+        }
+    } else {
+        vcmd->status = VHBA_REQ_SENT;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct vhba_device *vdev = file->private_data;
+    struct vhba_host *vhost;
+    struct scsi_device *sdev;
+
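+    /* The only ioctl (0xBEEF001) reports the SCSI address (host, channel,
+     * id, lun) that the kernel assigned to this virtual device. */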
+    switch (cmd) {
+        case 0xBEEF001: {
+            vhost = platform_get_drvdata(&vhba_platform_device);
+            sdev = scsi_device_lookup(vhost->shost, 0, vdev->id, 0);
+
+            if (sdev) {
+                int id[4] = {
+                    sdev->host->host_no,
+                    sdev->channel,
+                    sdev->id,
+                    sdev->lun
+                };
+
+                scsi_device_put(sdev);
+
+                if (copy_to_user((void *)arg, id, sizeof(id))) {
+                    return -EFAULT;
+                }
+
+                return 0;
+            } else {
+                return -ENODEV;
+            }
+        }
+    }
+
+    return -ENOTTY;
+}
+
+#ifdef CONFIG_COMPAT
+static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    unsigned long compat_arg = (unsigned long)compat_ptr(arg);
+    return vhba_ctl_ioctl(file, cmd, compat_arg);
+}
+#endif
+
+static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait)
+{
+    struct vhba_device *vdev = file->private_data;
+    unsigned int mask = 0;
+    unsigned long flags;
+
+    poll_wait(file, &vdev->cmd_wq, wait);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (next_command(vdev)) {
+        mask |= POLLIN | POLLRDNORM;
+    }
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return mask;
+}
+
+static int vhba_ctl_open (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    int retval;
+
+    DPRINTK("open\n");
+
+    /* check if vhba is probed */
+    if (!platform_get_drvdata(&vhba_platform_device)) {
+        return -ENODEV;
+    }
+
+    vdev = vhba_device_alloc();
+    if (!vdev) {
+        return -ENOMEM;
+    }
+
+    if (!(retval = vhba_add_device(vdev))) {
+        file->private_data = vdev;
+    }
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+static int vhba_ctl_release (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    DPRINTK("release\n");
+
+    vdev = file->private_data;
+
+    vhba_device_get(vdev);
+    vhba_remove_device(vdev);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING);
+
+        scmd_warn(vcmd->cmd, "device released with command %lu\n", vcmd->cmd->serial_number);
+        vcmd->cmd->result = DID_NO_CONNECT << 16;
+        vcmd->cmd->scsi_done(vcmd->cmd);
+
+        vhba_free_command(vcmd);
+    }
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    vhba_device_put(vdev);
+
+    return 0;
+}
+
+static struct file_operations vhba_ctl_fops = {
+    .owner = THIS_MODULE,
+    .open = vhba_ctl_open,
+    .release = vhba_ctl_release,
+    .read = vhba_ctl_read,
+    .write = vhba_ctl_write,
+    .poll = vhba_ctl_poll,
+    .unlocked_ioctl = vhba_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+    .compat_ioctl = vhba_ctl_compat_ioctl,
+#endif
+};
+
+static struct miscdevice vhba_miscdev = {
+    .minor = MISC_DYNAMIC_MINOR,
+    .name = "vhba_ctl",
+    .fops = &vhba_ctl_fops,
+};
+
+static int vhba_probe (struct platform_device *pdev)
+{
+    struct Scsi_Host *shost;
+    struct vhba_host *vhost;
+    int i;
+
+    shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host));
+    if (!shost) {
+        return -ENOMEM;
+    }
+
+    shost->max_id = VHBA_MAX_ID;
+    /* we don't support lun > 0 */
+    shost->max_lun = 1;
+    shost->max_cmd_len = MAX_COMMAND_SIZE;
+
+    vhost = (struct vhba_host *)shost->hostdata;
+    memset(vhost, 0, sizeof(*vhost));
+
+    vhost->shost = shost;
+    vhost->num_devices = 0;
+    spin_lock_init(&vhost->dev_lock);
+    spin_lock_init(&vhost->cmd_lock);
+    INIT_WORK(&vhost->scan_devices, vhba_scan_devices);
+    vhost->cmd_next = 0;
+    for (i = 0; i < vhost->shost->can_queue; i++) {
+        vhost->commands[i].status = VHBA_REQ_FREE;
+    }
+
+    platform_set_drvdata(pdev, vhost);
+
+    if (scsi_add_host(shost, &pdev->dev)) {
+        scsi_host_put(shost);
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
+static int vhba_remove (struct platform_device *pdev)
+{
+    struct vhba_host *vhost;
+    struct Scsi_Host *shost;
+
+    vhost = platform_get_drvdata(pdev);
+    shost = vhost->shost;
+
+    scsi_remove_host(shost);
+    scsi_host_put(shost);
+
+    return 0;
+}
+
+static void vhba_release (struct device * dev)
+{
+    return;
+}
+
+static struct platform_device vhba_platform_device = {
+    .name = "vhba",
+    .id = -1,
+    .dev = {
+        .release = vhba_release,
+    },
+};
+
+static struct platform_driver vhba_platform_driver = {
+    .driver = {
+        .owner = THIS_MODULE,
+        .name = "vhba",
+    },
+    .probe = vhba_probe,
+    .remove = vhba_remove,
+};
+
+static int __init vhba_init (void)
+{
+    int ret;
+
+    ret = platform_device_register(&vhba_platform_device);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = platform_driver_register(&vhba_platform_driver);
+    if (ret < 0) {
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    ret = misc_register(&vhba_miscdev);
+    if (ret < 0) {
+        platform_driver_unregister(&vhba_platform_driver);
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void __exit vhba_exit(void)
+{
+    misc_deregister(&vhba_miscdev);
+    platform_driver_unregister(&vhba_platform_driver);
+    platform_device_unregister(&vhba_platform_device);
+}
+
+module_init(vhba_init);
+module_exit(vhba_exit);
+
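The control device implemented above defines a small request/response protocol with user space: each read() on /dev/vhba_ctl returns one request header (tag, lun, CDB, data length) followed by outgoing data for write-direction commands, and each write() supplies a response header (tag, status, data length) followed by sense bytes or read-direction data. The sketch below illustrates that loop from the user-space side. It is only a minimal illustration under stated assumptions: the struct layouts and the MAX_COMMAND_SIZE value are inferred from the fields used in do_request()/do_response() and are not copied from the driver's real UAPI header, which a real backend should include instead.

#include <fcntl.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Assumed layouts matching the fields used by do_request()/do_response();
 * a real program must use the driver's own UAPI header instead. */
struct vhba_request {
    uint32_t tag;       /* echoed back in the response */
    uint32_t lun;
    uint8_t  cdb[16];   /* MAX_COMMAND_SIZE assumed to be 16 */
    uint8_t  cdb_len;
    uint32_t data_len;  /* write-direction payload follows the header */
};

struct vhba_response {
    uint32_t tag;
    uint32_t status;    /* SCSI status; 0 means good */
    uint32_t data_len;  /* read-direction payload follows the header */
};

int main(void)
{
    /* sized generously for this illustration; a real backend would size it
     * from the device limits advertised by the driver */
    static unsigned char buf[sizeof(struct vhba_request) + 65536];
    struct vhba_request req;
    struct vhba_response res;
    struct pollfd pfd;
    int fd = open("/dev/vhba_ctl", O_RDWR);

    if (fd < 0)
        return 1;

    for (;;) {
        /* vhba_ctl_poll() raises POLLIN when a command is pending */
        pfd.fd = fd;
        pfd.events = POLLIN;
        if (poll(&pfd, 1, -1) <= 0)
            break;

        /* one read() delivers one request plus any outgoing data */
        if (read(fd, buf, sizeof(buf)) < (ssize_t)sizeof(req))
            break;
        memcpy(&req, buf, sizeof(req));
        fprintf(stderr, "tag %u, cdb 0x%02x, data_len %u\n",
                (unsigned)req.tag, req.cdb[0], (unsigned)req.data_len);

        /* complete the command with good status and no data */
        memset(&res, 0, sizeof(res));
        res.tag = req.tag;
        if (write(fd, &res, sizeof(res)) < (ssize_t)sizeof(res))
            break;
    }
    close(fd);
    return 0;
}

A real backend (for example a CD-ROM emulator) would dispatch on req.cdb[0], append read-direction data after the response header, and report failures with a proper SCSI status and sense buffer; here every command is simply completed with good status and no data.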
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index c01f450..6ba1451 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -75,6 +75,124 @@ config VT_CONSOLE_SLEEP
 	def_bool y
 	depends on VT_CONSOLE && PM_SLEEP
 
+menuconfig VT_CKO
+	bool "Colored kernel message output"
+	depends on VT_CONSOLE
+	---help---
+	  This option enables kernel messages to be emitted in
+	  colors other than the default.
+
+	  The color value you need to enter is composed (OR-ed)
+	  of a foreground and a background color.
+
+	  Foreground:
+	  0x00 = black,   0x08 = dark gray,
+	  0x01 = red,     0x09 = light red,
+	  0x02 = green,   0x0A = light green,
+	  0x03 = brown,   0x0B = yellow,
+	  0x04 = blue,    0x0C = light blue,
+	  0x05 = magenta, 0x0D = light magenta,
+	  0x06 = cyan,    0x0E = light cyan,
+	  0x07 = gray,    0x0F = white,
+
+	  (Foreground colors 0x08 to 0x0F do not work when a VGA
+	  console font with 512 glyphs is used.)
+
+	  Background:
+	  0x00 = black,   0x40 = blue,
+	  0x10 = red,     0x50 = magenta,
+	  0x20 = green,   0x60 = cyan,
+	  0x30 = brown,   0x70 = gray,
+
+	  For example, 0x1F would yield white on red.
+
+	  If unsure, say N.
+
+config VT_PRINTK_EMERG_COLOR
+	hex "Emergency messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel emergency messages will
+	  be printed to the console.
+
+config VT_PRINTK_ALERT_COLOR
+	hex "Alert messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel alert messages will
+	  be printed to the console.
+
+config VT_PRINTK_CRIT_COLOR
+	hex "Critical messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel critical messages will
+	  be printed to the console.
+
+config VT_PRINTK_ERR_COLOR
+	hex "Error messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel error messages will
+	  be printed to the console.
+
+config VT_PRINTK_WARNING_COLOR
+	hex "Warning messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel warning messages will
+	  be printed to the console.
+
+config VT_PRINTK_NOTICE_COLOR
+	hex "Notice messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel notice messages will
+	  be printed to the console.
+
+config VT_PRINTK_INFO_COLOR
+	hex "Information messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel information messages will
+	  be printed to the console.
+
+config VT_PRINTK_DEBUG_COLOR
+	hex "Debug messages color"
+	range 0x00 0xFF
+	depends on VT_CKO
+	default 0x07
+	---help---
+	  This option defines with which color kernel debug messages will
+	  be printed to the console.
+
+config NR_TTY_DEVICES
+        int "Maximum tty device number"
+        depends on VT
+        range 12 63
+        default 63
+        ---help---
+          This option is used to change the number of tty devices in /dev.
+          The default value is 63. The lowest number you can set is 12;
+          63 is the upper limit so that we do not overrun the serial
+          consoles.
+
+          If unsure, say 63.
+
 config HW_CONSOLE
 	bool
 	depends on VT && !UML
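The VT_CKO help text above explains that a color value is the OR of a foreground code (low nibble) and a background code (bits 4-6); the vt.c hunk further below splits it back apart in vc_set_color() with `color & 0xF` and `(color >> 4) & 0x7`. The stand-alone sketch below only demonstrates that encoding for the example from the help text (0x1F, white on red); the helper names are made up for illustration and are not part of the patch.

#include <stdio.h>

/* Illustrative only: compose/decompose a VT_CKO color value the same way the
 * help text above describes and vc_set_color() in the vt.c hunk below does. */
static unsigned char compose_color(unsigned char fg, unsigned char bg)
{
    /* fg is one of the 0x00-0x0F foreground codes, bg one of the
     * 0x00-0x70 background codes; the combined value is simply their OR */
    return (unsigned char)(fg | bg);
}

static void decompose_color(unsigned char color)
{
    unsigned char fg_index = color & 0x0F;        /* bits 0-3 */
    unsigned char bg_index = (color >> 4) & 0x07; /* bits 4-6 */

    printf("0x%02X -> foreground index %u, background index %u\n",
           color, (unsigned)fg_index, (unsigned)bg_index);
}

int main(void)
{
    unsigned char white_on_red = compose_color(0x0F, 0x10);

    decompose_color(white_on_red);  /* 0x1F -> foreground 15 (white), background 1 (red) */
    decompose_color(0x07);          /* the default: gray on black */
    return 0;
}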
diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index e46d628..3155b46 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -141,7 +141,7 @@ static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
  */
 
 static void hvc_console_print(struct console *co, const char *b,
-			      unsigned count)
+			      unsigned count, unsigned loglevel)
 {
 	char c[N_OUTBUF] __ALIGNED__;
 	unsigned i = 0, n = 0;
diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index fa816b7..f9da806 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -599,7 +599,7 @@ console_initcall(xen_cons_init);
 
 #ifdef CONFIG_EARLY_PRINTK
 static void xenboot_write_console(struct console *console, const char *string,
-				  unsigned len)
+				  unsigned len, unsigned loglevel)
 {
 	unsigned int linelen, off = 0;
 	const char *pos;
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index c9720a9..279a842 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -587,11 +587,11 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev)
 #ifdef CONFIG_SERIAL_8250_CONSOLE
 
 static void univ8250_console_write(struct console *co, const char *s,
-				   unsigned int count)
+				   unsigned int count, unsigned int loglevel)
 {
 	struct uart_8250_port *up = &serial8250_ports[co->index];
 
-	serial8250_console_write(up, s, count);
+	serial8250_console_write(up, s, count, loglevel);
 }
 
 static int univ8250_console_setup(struct console *co, char *options)
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index af62131..2aa1513 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -93,7 +93,7 @@ static void __init serial_putc(struct uart_port *port, int c)
 }
 
 static void __init early_serial8250_write(struct console *console,
-					const char *s, unsigned int count)
+          const char *s, unsigned int count, unsigned int loglevel)
 {
 	struct earlycon_device *device = console->data;
 	struct uart_port *port = &device->port;
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 720b946..b1a6ba3 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2852,7 +2852,7 @@ static void serial8250_console_restore(struct uart_8250_port *up)
  *	The console_lock must be held when we get here.
  */
 void serial8250_console_write(struct uart_8250_port *up, const char *s,
-			      unsigned int count)
+			      unsigned int count, unsigned int loglevel)
 {
 	struct uart_port *port = &up->port;
 	unsigned long flags;
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index bd51bdd..8e550a6 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -71,6 +71,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/tty.h>
@@ -2531,16 +2532,44 @@ int vt_kmsg_redirect(int new)
 		return kmsg_con;
 }
 
+#ifdef CONFIG_VT_CKO
+static unsigned int printk_color[8] __read_mostly = {
+	CONFIG_VT_PRINTK_EMERG_COLOR,	/* KERN_EMERG */
+	CONFIG_VT_PRINTK_ALERT_COLOR,	/* KERN_ALERT */
+	CONFIG_VT_PRINTK_CRIT_COLOR,	/* KERN_CRIT */
+	CONFIG_VT_PRINTK_ERR_COLOR,	/* KERN_ERR */
+	CONFIG_VT_PRINTK_WARNING_COLOR,	/* KERN_WARNING */
+	CONFIG_VT_PRINTK_NOTICE_COLOR,	/* KERN_NOTICE */
+	CONFIG_VT_PRINTK_INFO_COLOR,	/* KERN_INFO */
+	CONFIG_VT_PRINTK_DEBUG_COLOR,	/* KERN_DEBUG */
+};
+module_param_array(printk_color, uint, NULL, S_IRUGO | S_IWUSR);
+
+static inline void vc_set_color(struct vc_data *vc, unsigned char color)
+{
+	vc->vc_color = color_table[color & 0xF] |
+	               (color_table[(color >> 4) & 0x7] << 4) |
+	               (color & 0x80);
+	update_attr(vc);
+}
+#else
+static unsigned int printk_color[8];
+static inline void vc_set_color(const struct vc_data *vc, unsigned char c)
+{
+}
+#endif
+
 /*
  *	Console on virtual terminal
  *
  * The console must be locked when we get here.
  */
 
-static void vt_console_print(struct console *co, const char *b, unsigned count)
+static void vt_console_print(struct console *co, const char *b, unsigned count,
+			     unsigned int loglevel)
 {
 	struct vc_data *vc = vc_cons[fg_console].d;
-	unsigned char c;
+	unsigned char current_color, c;
 	static DEFINE_SPINLOCK(printing_lock);
 	const ushort *start;
 	ushort cnt = 0;
@@ -2576,11 +2605,20 @@ static void vt_console_print(struct console *co, const char *b, unsigned count)
 
 	start = (ushort *)vc->vc_pos;
 
+	/*
+	 * We always get a valid loglevel - <8> and "no level" is transformed
+	 * to <4> in the typical kernel.
+	 */
+	current_color = printk_color[loglevel];
+	vc_set_color(vc, current_color);
+
+
 	/* Contrived structure to try to emulate original need_wrap behaviour
 	 * Problems caused when we have need_wrap set on '\n' character */
 	while (count--) {
 		c = *b++;
 		if (c == 10 || c == 13 || c == 8 || vc->vc_need_wrap) {
+			vc_set_color(vc, vc->vc_def_color);
 			if (cnt > 0) {
 				if (CON_IS_VISIBLE(vc))
 					vc->vc_sw->con_putcs(vc, start, cnt, vc->vc_y, vc->vc_x);
@@ -2593,6 +2631,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count)
 				bs(vc);
 				start = (ushort *)vc->vc_pos;
 				myx = vc->vc_x;
+				vc_set_color(vc, current_color);
 				continue;
 			}
 			if (c != 13)
@@ -2600,6 +2639,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count)
 			cr(vc);
 			start = (ushort *)vc->vc_pos;
 			myx = vc->vc_x;
+			vc_set_color(vc, current_color);
 			if (c == 10 || c == 13)
 				continue;
 		}
@@ -2622,6 +2662,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count)
 			vc->vc_need_wrap = 1;
 		}
 	}
+	vc_set_color(vc, vc->vc_def_color);
 	set_cursor(vc);
 	notify_update(vc);
 
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index 6af145f..5a76e4a 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -1514,8 +1514,13 @@ void gserial_disconnect(struct gserial *gser)
 	gser->ioport = NULL;
 	if (port->port.count > 0 || port->openclose) {
 		wake_up_interruptible(&port->drain_wait);
+#if 0
 		if (port->port.tty)
 			tty_hangup(port->port.tty);
+#else
+		if (port->port.tty)
+			stop_tty(port->port.tty);
+#endif
 	}
 	spin_unlock_irqrestore(&port->port_lock, flags);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 9adee0d..5c1e6f4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -112,6 +112,7 @@ if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
+source "fs/exfat/Kconfig"
 source "fs/ntfs/Kconfig"
 
 endmenu
@@ -232,6 +233,7 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/aufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 79f5225..0c61756 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
+obj-$(CONFIG_EXFAT_FS)		+= exfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
@@ -126,3 +127,4 @@ obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_AUFS_FS)           += aufs/
diff --git b/fs/aufs/Kconfig b/fs/aufs/Kconfig
new file mode 100644
index 0000000..63560ce
--- /dev/null
+++ b/fs/aufs/Kconfig
@@ -0,0 +1,185 @@
+config AUFS_FS
+	tristate "Aufs (Advanced multi layered unification filesystem) support"
+	help
+	Aufs is a stackable unification filesystem such as Unionfs,
+	which unifies several directories and provides a merged single
+	directory.
+	In its early days, aufs was an entirely re-designed and
+	re-implemented Unionfs Version 1.x series. By introducing many
+	original ideas, approaches and improvements, it has become totally
+	different from Unionfs while keeping the basic features.
+
+if AUFS_FS
+choice
+	prompt "Maximum number of branches"
+	default AUFS_BRANCH_MAX_127
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. The larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_127
+	bool "127"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. The larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_511
+	bool "511"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. The larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_1023
+	bool "1023"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. The larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_32767
+	bool "32767"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. The larger value consumes more system
+	resources and has a minor impact on performance.
+endchoice
+
+config AUFS_SBILIST
+	bool
+	depends on AUFS_MAGIC_SYSRQ || PROC_FS
+	default y
+	help
+	Automatic configuration for internal use.
+	When aufs supports Magic SysRq or /proc, enabled automatically.
+
+config AUFS_HNOTIFY
+	bool "Detect direct branch access (bypassing aufs)"
+	help
+	If you want to modify files on branches directly, e.g. bypassing aufs,
+	and want aufs to fully detect those changes, then enable this
+	option and use the 'udba=notify' mount option.
+	Currently there is only one available configuration, "fsnotify".
+	It will have a negative impact on performance.
+	See detail in aufs.5.
+
+choice
+	prompt "method" if AUFS_HNOTIFY
+	default AUFS_HFSNOTIFY
+config AUFS_HFSNOTIFY
+	bool "fsnotify"
+	select FSNOTIFY
+endchoice
+
+config AUFS_EXPORT
+	bool "NFS-exportable aufs"
+	depends on EXPORTFS
+	help
+	If you want to export your mounted aufs via NFS, then enable this
+	option. There are several requirements for this configuration.
+	See detail in aufs.5.
+
+config AUFS_INO_T_64
+	bool
+	depends on AUFS_EXPORT
+	depends on 64BIT && !(ALPHA || S390)
+	default y
+	help
+	Automatic configuration for internal use.
+	/* typedef unsigned long/int __kernel_ino_t */
+	/* alpha and s390x are int */
+
+config AUFS_XATTR
+	bool "support for XATTR/EA (including Security Labels)"
+	help
+	If your branch fs supports XATTR/EA and you want to make them
+	available in aufs too, then enable this option and specify the
+	branch attributes for EA.
+	See detail in aufs.5.
+
+config AUFS_FHSM
+	bool "File-based Hierarchical Storage Management"
+	help
+	Hierarchical Storage Management (or HSM) is a well-known feature
+	in the storage world. Aufs provides this feature file-based,
+	using multiple branches.
+	These multiple branches are prioritized, i.e. the topmost one
+	should be the fastest drive and be used heavily.
+
+config AUFS_RDU
+	bool "Readdir in userspace"
+	help
+	Aufs has two methods to provide a merged view for a directory,
+	by a user-space library and by kernel-space natively. The latter
+	is always enabled but sometimes large and slow.
+	If you enable this option, install the library from the aufs2-util
+	package, and set some environment variables for your readdir(3),
+	then the work will be handled in user-space, which generally
+	performs better.
+	See detail in aufs.5.
+
+config AUFS_SHWH
+	bool "Show whiteouts"
+	help
+	If you want to make the whiteouts in aufs visible, then enable
+	this option and specify the 'shwh' mount option. Although it may
+	sound philosophical, technically it simply shows the name of the
+	whiteout while keeping its behaviour.
+
+config AUFS_BR_RAMFS
+	bool "Ramfs (initramfs/rootfs) as an aufs branch"
+	help
+	If you want to use ramfs as an aufs branch fs, then enable this
+	option. Generally tmpfs is recommended.
+	Aufs prohibits them from being a branch fs by default, because
+	initramfs generally becomes unusable after switch_root or the like.
+	If you set initramfs as an aufs branch and boot your system via
+	switch_root, you will easily run into problems since the files in
+	initramfs may become inaccessible.
+	Unless you are going to use ramfs as an aufs branch fs without
+	switch_root or the like, leave it N.
+
+config AUFS_BR_FUSE
+	bool "Fuse fs as an aufs branch"
+	depends on FUSE_FS
+	select AUFS_POLL
+	help
+	If you want to use a fuse-based userspace filesystem as an aufs
+	branch fs, then enable this option.
+	It implements the internal poll(2) operation, which is currently
+	implemented by fuse only.
+
+config AUFS_POLL
+	bool
+	help
+	Automatic configuration for internal use.
+
+config AUFS_BR_HFSPLUS
+	bool "Hfsplus as an aufs branch"
+	depends on HFSPLUS_FS
+	default y
+	help
+	If you want to use hfsplus fs as an aufs branch fs, then enable
+	this option. This option introduces a small overhead when
+	copying up a file on hfsplus.
+
+config AUFS_BDEV_LOOP
+	bool
+	depends on BLK_DEV_LOOP
+	default y
+	help
+	Automatic configuration for internal use.
+	Convert =[ym] into =y.
+
+config AUFS_DEBUG
+	bool "Debug aufs"
+	help
+	Enable this to compile aufs internal debug code.
+	It will have a negative impact on performance.
+
+config AUFS_MAGIC_SYSRQ
+	bool
+	depends on AUFS_DEBUG && MAGIC_SYSRQ
+	default y
+	help
+	Automatic configuration for internal use.
+	When aufs supports Magic SysRq, enabled automatically.
+endif
diff --git b/fs/aufs/Makefile b/fs/aufs/Makefile
new file mode 100644
index 0000000..c7efb62
--- /dev/null
+++ b/fs/aufs/Makefile
@@ -0,0 +1,36 @@
+
+include ${srctree}/${src}/magic.mk
+
+# cf. include/linux/kernel.h
+# enable pr_debug
+ccflags-y += -DDEBUG
+# sparse requires the full pathname
+ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h
+
+obj-$(CONFIG_AUFS_FS) += aufs.o
+aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \
+	wkq.o vfsub.o dcsub.o \
+	cpup.o whout.o wbr_policy.o \
+	dinfo.o dentry.o \
+	dynop.o \
+	finfo.o file.o f_op.o \
+	dir.o vdir.o \
+	iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \
+	mvdown.o ioctl.o
+
+# all are boolean
+aufs-$(CONFIG_PROC_FS) += procfs.o plink.o
+aufs-$(CONFIG_SYSFS) += sysfs.o
+aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o
+aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o
+aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o
+aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o
+aufs-$(CONFIG_AUFS_EXPORT) += export.o
+aufs-$(CONFIG_AUFS_XATTR) += xattr.o
+aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
+aufs-$(CONFIG_AUFS_FHSM) += fhsm.o
+aufs-$(CONFIG_AUFS_POLL) += poll.o
+aufs-$(CONFIG_AUFS_RDU) += rdu.o
+aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o
+aufs-$(CONFIG_AUFS_DEBUG) += debug.o
+aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o
diff --git b/fs/aufs/aufs.h b/fs/aufs/aufs.h
new file mode 100644
index 0000000..49f43b4
--- /dev/null
+++ b/fs/aufs/aufs.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * all header files
+ */
+
+#ifndef __AUFS_H__
+#define __AUFS_H__
+
+#ifdef __KERNEL__
+
+#define AuStub(type, name, body, ...) \
+	static inline type name(__VA_ARGS__) { body; }
+
+#define AuStubVoid(name, ...) \
+	AuStub(void, name, , __VA_ARGS__)
+#define AuStubInt0(name, ...) \
+	AuStub(int, name, return 0, __VA_ARGS__)
+
+#include "debug.h"
+
+#include "branch.h"
+#include "cpup.h"
+#include "dcsub.h"
+#include "dbgaufs.h"
+#include "dentry.h"
+#include "dir.h"
+#include "dynop.h"
+#include "file.h"
+#include "fstype.h"
+#include "inode.h"
+#include "loop.h"
+#include "module.h"
+#include "opts.h"
+#include "rwsem.h"
+#include "spl.h"
+#include "super.h"
+#include "sysaufs.h"
+#include "vfsub.h"
+#include "whout.h"
+#include "wkq.h"
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_H__ */
diff --git b/fs/aufs/branch.c b/fs/aufs/branch.c
new file mode 100644
index 0000000..496bdaa
--- /dev/null
+++ b/fs/aufs/branch.c
@@ -0,0 +1,1394 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * branch management
+ */
+
+#include <linux/compat.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/*
+ * free a single branch
+ */
+static void au_br_do_free(struct au_branch *br)
+{
+	int i;
+	struct au_wbr *wbr;
+	struct au_dykey **key;
+
+	au_hnotify_fin_br(br);
+
+	if (br->br_xino.xi_file)
+		fput(br->br_xino.xi_file);
+	mutex_destroy(&br->br_xino.xi_nondir_mtx);
+
+	AuDebugOn(atomic_read(&br->br_count));
+
+	wbr = br->br_wbr;
+	if (wbr) {
+		for (i = 0; i < AuBrWh_Last; i++)
+			dput(wbr->wbr_wh[i]);
+		AuDebugOn(atomic_read(&wbr->wbr_wh_running));
+		AuRwDestroy(&wbr->wbr_wh_rwsem);
+	}
+
+	if (br->br_fhsm) {
+		au_br_fhsm_fin(br->br_fhsm);
+		kfree(br->br_fhsm);
+	}
+
+	key = br->br_dykey;
+	for (i = 0; i < AuBrDynOp; i++, key++)
+		if (*key)
+			au_dy_put(*key);
+		else
+			break;
+
+	/* recursive lock, s_umount of branch's */
+	lockdep_off();
+	path_put(&br->br_path);
+	lockdep_on();
+	kfree(wbr);
+	kfree(br);
+}
+
+/*
+ * frees all branches
+ */
+void au_br_free(struct au_sbinfo *sbinfo)
+{
+	aufs_bindex_t bmax;
+	struct au_branch **br;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	bmax = sbinfo->si_bend + 1;
+	br = sbinfo->si_branch;
+	while (bmax--)
+		au_br_do_free(*br++);
+}
+
+/*
+ * find the index of a branch which is specified by @br_id.
+ */
+int au_br_index(struct super_block *sb, aufs_bindex_t br_id)
+{
+	aufs_bindex_t bindex, bend;
+
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++)
+		if (au_sbr_id(sb, bindex) == br_id)
+			return bindex;
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * add a branch
+ */
+
+static int test_overlap(struct super_block *sb, struct dentry *h_adding,
+			struct dentry *h_root)
+{
+	if (unlikely(h_adding == h_root
+		     || au_test_loopback_overlap(sb, h_adding)))
+		return 1;
+	if (h_adding->d_sb != h_root->d_sb)
+		return 0;
+	return au_test_subdir(h_adding, h_root)
+		|| au_test_subdir(h_root, h_adding);
+}
+
+/*
+ * returns a newly allocated branch. @new_nbranch is the number of branches
+ * after adding the branch.
+ */
+static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch,
+				     int perm)
+{
+	struct au_branch *add_branch;
+	struct dentry *root;
+	struct inode *inode;
+	int err;
+
+	err = -ENOMEM;
+	root = sb->s_root;
+	add_branch = kzalloc(sizeof(*add_branch), GFP_NOFS);
+	if (unlikely(!add_branch))
+		goto out;
+
+	err = au_hnotify_init_br(add_branch, perm);
+	if (unlikely(err))
+		goto out_br;
+
+	if (au_br_writable(perm)) {
+		/* may be freed separately at changing the branch permission */
+		add_branch->br_wbr = kzalloc(sizeof(*add_branch->br_wbr),
+					     GFP_NOFS);
+		if (unlikely(!add_branch->br_wbr))
+			goto out_hnotify;
+	}
+
+	if (au_br_fhsm(perm)) {
+		err = au_fhsm_br_alloc(add_branch);
+		if (unlikely(err))
+			goto out_wbr;
+	}
+
+	err = au_sbr_realloc(au_sbi(sb), new_nbranch);
+	if (!err)
+		err = au_di_realloc(au_di(root), new_nbranch);
+	if (!err) {
+		inode = d_inode(root);
+		err = au_ii_realloc(au_ii(inode), new_nbranch);
+	}
+	if (!err)
+		return add_branch; /* success */
+
+out_wbr:
+	kfree(add_branch->br_wbr);
+out_hnotify:
+	au_hnotify_fin_br(add_branch);
+out_br:
+	kfree(add_branch);
+out:
+	return ERR_PTR(err);
+}
+
+/*
+ * test if the branch permission is legal or not.
+ */
+static int test_br(struct inode *inode, int brperm, char *path)
+{
+	int err;
+
+	err = (au_br_writable(brperm) && IS_RDONLY(inode));
+	if (!err)
+		goto out;
+
+	err = -EINVAL;
+	pr_err("write permission for readonly mount or inode, %s\n", path);
+
+out:
+	return err;
+}
+
+/*
+ * returns:
+ * 0: success, the caller will add it
+ * plus: success, it is already unified, the caller should ignore it
+ * minus: error
+ */
+static int test_add(struct super_block *sb, struct au_opt_add *add, int remount)
+{
+	int err;
+	aufs_bindex_t bend, bindex;
+	struct dentry *root, *h_dentry;
+	struct inode *inode, *h_inode;
+
+	root = sb->s_root;
+	bend = au_sbend(sb);
+	if (unlikely(bend >= 0
+		     && au_find_dbindex(root, add->path.dentry) >= 0)) {
+		err = 1;
+		if (!remount) {
+			err = -EINVAL;
+			pr_err("%s duplicated\n", add->pathname);
+		}
+		goto out;
+	}
+
+	err = -ENOSPC; /* -E2BIG; */
+	if (unlikely(AUFS_BRANCH_MAX <= add->bindex
+		     || AUFS_BRANCH_MAX - 1 <= bend)) {
+		pr_err("number of branches exceeded %s\n", add->pathname);
+		goto out;
+	}
+
+	err = -EDOM;
+	if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) {
+		pr_err("bad index %d\n", add->bindex);
+		goto out;
+	}
+
+	inode = d_inode(add->path.dentry);
+	err = -ENOENT;
+	if (unlikely(!inode->i_nlink)) {
+		pr_err("no existence %s\n", add->pathname);
+		goto out;
+	}
+
+	err = -EINVAL;
+	if (unlikely(inode->i_sb == sb)) {
+		pr_err("%s must be outside\n", add->pathname);
+		goto out;
+	}
+
+	if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) {
+		pr_err("unsupported filesystem, %s (%s)\n",
+		       add->pathname, au_sbtype(inode->i_sb));
+		goto out;
+	}
+
+	if (unlikely(inode->i_sb->s_stack_depth)) {
+		pr_err("already stacked, %s (%s)\n",
+		       add->pathname, au_sbtype(inode->i_sb));
+		goto out;
+	}
+
+	err = test_br(d_inode(add->path.dentry), add->perm, add->pathname);
+	if (unlikely(err))
+		goto out;
+
+	if (bend < 0)
+		return 0; /* success */
+
+	err = -EINVAL;
+	for (bindex = 0; bindex <= bend; bindex++)
+		if (unlikely(test_overlap(sb, add->path.dentry,
+					  au_h_dptr(root, bindex)))) {
+			pr_err("%s is overlapped\n", add->pathname);
+			goto out;
+		}
+
+	err = 0;
+	if (au_opt_test(au_mntflags(sb), WARN_PERM)) {
+		h_dentry = au_h_dptr(root, 0);
+		h_inode = d_inode(h_dentry);
+		if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO)
+		    || !uid_eq(h_inode->i_uid, inode->i_uid)
+		    || !gid_eq(h_inode->i_gid, inode->i_gid))
+			pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n",
+				add->pathname,
+				i_uid_read(inode), i_gid_read(inode),
+				(inode->i_mode & S_IALLUGO),
+				i_uid_read(h_inode), i_gid_read(h_inode),
+				(h_inode->i_mode & S_IALLUGO));
+	}
+
+out:
+	return err;
+}
+
+/*
+ * initialize or clean the whiteouts for an adding branch
+ */
+static int au_br_init_wh(struct super_block *sb, struct au_branch *br,
+			 int new_perm)
+{
+	int err, old_perm;
+	aufs_bindex_t bindex;
+	struct inode *h_inode;
+	struct au_wbr *wbr;
+	struct au_hinode *hdir;
+	struct dentry *h_dentry;
+
+	err = vfsub_mnt_want_write(au_br_mnt(br));
+	if (unlikely(err))
+		goto out;
+
+	wbr = br->br_wbr;
+	old_perm = br->br_perm;
+	br->br_perm = new_perm;
+	hdir = NULL;
+	h_inode = NULL;
+	bindex = au_br_index(sb, br->br_id);
+	if (0 <= bindex) {
+		hdir = au_hi(d_inode(sb->s_root), bindex);
+		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
+	} else {
+		h_dentry = au_br_dentry(br);
+		h_inode = d_inode(h_dentry);
+		inode_lock_nested(h_inode, AuLsc_I_PARENT);
+	}
+	if (!wbr)
+		err = au_wh_init(br, sb);
+	else {
+		wbr_wh_write_lock(wbr);
+		err = au_wh_init(br, sb);
+		wbr_wh_write_unlock(wbr);
+	}
+	if (hdir)
+		au_hn_imtx_unlock(hdir);
+	else
+		inode_unlock(h_inode);
+	vfsub_mnt_drop_write(au_br_mnt(br));
+	br->br_perm = old_perm;
+
+	if (!err && wbr && !au_br_writable(new_perm)) {
+		kfree(wbr);
+		br->br_wbr = NULL;
+	}
+
+out:
+	return err;
+}
+
+static int au_wbr_init(struct au_branch *br, struct super_block *sb,
+		       int perm)
+{
+	int err;
+	struct kstatfs kst;
+	struct au_wbr *wbr;
+
+	wbr = br->br_wbr;
+	au_rw_init(&wbr->wbr_wh_rwsem);
+	atomic_set(&wbr->wbr_wh_running, 0);
+
+	/*
+	 * a limit for rmdir/rename a dir
+	 * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h
+	 */
+	err = vfs_statfs(&br->br_path, &kst);
+	if (unlikely(err))
+		goto out;
+	err = -EINVAL;
+	if (kst.f_namelen >= NAME_MAX)
+		err = au_br_init_wh(sb, br, perm);
+	else
+		pr_err("%pd(%s), unsupported namelen %ld\n",
+		       au_br_dentry(br),
+		       au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen);
+
+out:
+	return err;
+}
+
+/* initialize a new branch */
+static int au_br_init(struct au_branch *br, struct super_block *sb,
+		      struct au_opt_add *add)
+{
+	int err;
+	struct inode *h_inode;
+
+	err = 0;
+	mutex_init(&br->br_xino.xi_nondir_mtx);
+	br->br_perm = add->perm;
+	br->br_path = add->path; /* set first, path_get() later */
+	spin_lock_init(&br->br_dykey_lock);
+	atomic_set(&br->br_count, 0);
+	atomic_set(&br->br_xino_running, 0);
+	br->br_id = au_new_br_id(sb);
+	AuDebugOn(br->br_id < 0);
+
+	if (au_br_writable(add->perm)) {
+		err = au_wbr_init(br, sb, add->perm);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	if (au_opt_test(au_mntflags(sb), XINO)) {
+		h_inode = d_inode(add->path.dentry);
+		err = au_xino_br(sb, br, h_inode->i_ino,
+				 au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1);
+		if (unlikely(err)) {
+			AuDebugOn(br->br_xino.xi_file);
+			goto out_err;
+		}
+	}
+
+	sysaufs_br_init(br);
+	path_get(&br->br_path);
+	goto out; /* success */
+
+out_err:
+	memset(&br->br_path, 0, sizeof(br->br_path));
+out:
+	return err;
+}
+
+static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex,
+			     struct au_branch *br, aufs_bindex_t bend,
+			     aufs_bindex_t amount)
+{
+	struct au_branch **brp;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	brp = sbinfo->si_branch + bindex;
+	memmove(brp + 1, brp, sizeof(*brp) * amount);
+	*brp = br;
+	sbinfo->si_bend++;
+	if (unlikely(bend < 0))
+		sbinfo->si_bend = 0;
+}
+
+static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex,
+			     aufs_bindex_t bend, aufs_bindex_t amount)
+{
+	struct au_hdentry *hdp;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	hdp = dinfo->di_hdentry + bindex;
+	memmove(hdp + 1, hdp, sizeof(*hdp) * amount);
+	au_h_dentry_init(hdp);
+	dinfo->di_bend++;
+	if (unlikely(bend < 0))
+		dinfo->di_bstart = 0;
+}
+
+static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex,
+			     aufs_bindex_t bend, aufs_bindex_t amount)
+{
+	struct au_hinode *hip;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	hip = iinfo->ii_hinode + bindex;
+	memmove(hip + 1, hip, sizeof(*hip) * amount);
+	hip->hi_inode = NULL;
+	au_hn_init(hip);
+	iinfo->ii_bend++;
+	if (unlikely(bend < 0))
+		iinfo->ii_bstart = 0;
+}
+
+static void au_br_do_add(struct super_block *sb, struct au_branch *br,
+			 aufs_bindex_t bindex)
+{
+	struct dentry *root, *h_dentry;
+	struct inode *root_inode, *h_inode;
+	aufs_bindex_t bend, amount;
+
+	root = sb->s_root;
+	root_inode = d_inode(root);
+	bend = au_sbend(sb);
+	amount = bend + 1 - bindex;
+	h_dentry = au_br_dentry(br);
+	au_sbilist_lock();
+	au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount);
+	au_br_do_add_hdp(au_di(root), bindex, bend, amount);
+	au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount);
+	au_set_h_dptr(root, bindex, dget(h_dentry));
+	h_inode = d_inode(h_dentry);
+	au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0);
+	au_sbilist_unlock();
+}
+
+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount)
+{
+	int err;
+	aufs_bindex_t bend, add_bindex;
+	struct dentry *root, *h_dentry;
+	struct inode *root_inode;
+	struct au_branch *add_branch;
+
+	root = sb->s_root;
+	root_inode = d_inode(root);
+	IMustLock(root_inode);
+	err = test_add(sb, add, remount);
+	if (unlikely(err < 0))
+		goto out;
+	if (err) {
+		err = 0;
+		goto out; /* success */
+	}
+
+	bend = au_sbend(sb);
+	add_branch = au_br_alloc(sb, bend + 2, add->perm);
+	err = PTR_ERR(add_branch);
+	if (IS_ERR(add_branch))
+		goto out;
+
+	err = au_br_init(add_branch, sb, add);
+	if (unlikely(err)) {
+		au_br_do_free(add_branch);
+		goto out;
+	}
+
+	add_bindex = add->bindex;
+	if (!remount)
+		au_br_do_add(sb, add_branch, add_bindex);
+	else {
+		sysaufs_brs_del(sb, add_bindex);
+		au_br_do_add(sb, add_branch, add_bindex);
+		sysaufs_brs_add(sb, add_bindex);
+	}
+
+	h_dentry = add->path.dentry;
+	if (!add_bindex) {
+		au_cpup_attr_all(root_inode, /*force*/1);
+		sb->s_maxbytes = h_dentry->d_sb->s_maxbytes;
+	} else
+		au_add_nlink(root_inode, d_inode(h_dentry));
+
+	/*
+	 * this test/set prevents aufs from handling unnecessary notify events
+	 * of xino files, in case of re-adding a writable branch which was
+	 * once detached from aufs.
+	 */
+	if (au_xino_brid(sb) < 0
+	    && au_br_writable(add_branch->br_perm)
+	    && !au_test_fs_bad_xino(h_dentry->d_sb)
+	    && add_branch->br_xino.xi_file
+	    && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry)
+		au_xino_brid_set(sb, add_branch->br_id);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static unsigned long long au_farray_cb(struct super_block *sb, void *a,
+				       unsigned long long max __maybe_unused,
+				       void *arg)
+{
+	unsigned long long n;
+	struct file **p, *f;
+	struct au_sphlhead *files;
+	struct au_finfo *finfo;
+
+	n = 0;
+	p = a;
+	files = &au_sbi(sb)->si_files;
+	spin_lock(&files->spin);
+	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
+		f = finfo->fi_file;
+		if (file_count(f)
+		    && !special_file(file_inode(f)->i_mode)) {
+			get_file(f);
+			*p++ = f;
+			n++;
+			AuDebugOn(n > max);
+		}
+	}
+	spin_unlock(&files->spin);
+
+	return n;
+}
+
+static struct file **au_farray_alloc(struct super_block *sb,
+				     unsigned long long *max)
+{
+	*max = atomic_long_read(&au_sbi(sb)->si_nfiles);
+	return au_array_alloc(max, au_farray_cb, sb, /*arg*/NULL);
+}
+
+static void au_farray_free(struct file **a, unsigned long long max)
+{
+	unsigned long long ull;
+
+	for (ull = 0; ull < max; ull++)
+		if (a[ull])
+			fput(a[ull]);
+	kvfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * delete a branch
+ */
+
+/* to show the line number, do not make this an inline function */
+#define AuVerbose(do_info, fmt, ...) do { \
+	if (do_info) \
+		pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart,
+			 aufs_bindex_t bend)
+{
+	return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend;
+}
+
+static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart,
+			 aufs_bindex_t bend)
+{
+	return au_test_ibusy(d_inode(dentry), bstart, bend);
+}
+
+/*
+ * test if the branch is deletable or not.
+ */
+static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex,
+			    unsigned int sigen, const unsigned int verbose)
+{
+	int err, i, j, ndentry;
+	aufs_bindex_t bstart, bend;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry *d;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, root, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	for (i = 0; !err && i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		ndentry = dpage->ndentry;
+		for (j = 0; !err && j < ndentry; j++) {
+			d = dpage->dentries[j];
+			AuDebugOn(au_dcount(d) <= 0);
+			if (!au_digen_test(d, sigen)) {
+				di_read_lock_child(d, AuLock_IR);
+				if (unlikely(au_dbrange_test(d))) {
+					di_read_unlock(d, AuLock_IR);
+					continue;
+				}
+			} else {
+				di_write_lock_child(d);
+				if (unlikely(au_dbrange_test(d))) {
+					di_write_unlock(d);
+					continue;
+				}
+				err = au_reval_dpath(d, sigen);
+				if (!err)
+					di_downgrade_lock(d, AuLock_IR);
+				else {
+					di_write_unlock(d);
+					break;
+				}
+			}
+
+			/* AuDbgDentry(d); */
+			bstart = au_dbstart(d);
+			bend = au_dbend(d);
+			if (bstart <= bindex
+			    && bindex <= bend
+			    && au_h_dptr(d, bindex)
+			    && au_test_dbusy(d, bstart, bend)) {
+				err = -EBUSY;
+				AuVerbose(verbose, "busy %pd\n", d);
+				AuDbgDentry(d);
+			}
+			di_read_unlock(d, AuLock_IR);
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex,
+			   unsigned int sigen, const unsigned int verbose)
+{
+	int err;
+	unsigned long long max, ull;
+	struct inode *i, **array;
+	aufs_bindex_t bstart, bend;
+
+	array = au_iarray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	AuDbg("b%d\n", bindex);
+	for (ull = 0; !err && ull < max; ull++) {
+		i = array[ull];
+		if (unlikely(!i))
+			break;
+		if (i->i_ino == AUFS_ROOT_INO)
+			continue;
+
+		/* AuDbgInode(i); */
+		if (au_iigen(i, NULL) == sigen)
+			ii_read_lock_child(i);
+		else {
+			ii_write_lock_child(i);
+			err = au_refresh_hinode_self(i);
+			au_iigen_dec(i);
+			if (!err)
+				ii_downgrade_lock(i);
+			else {
+				ii_write_unlock(i);
+				break;
+			}
+		}
+
+		bstart = au_ibstart(i);
+		bend = au_ibend(i);
+		if (bstart <= bindex
+		    && bindex <= bend
+		    && au_h_iptr(i, bindex)
+		    && au_test_ibusy(i, bstart, bend)) {
+			err = -EBUSY;
+			AuVerbose(verbose, "busy i%lu\n", i->i_ino);
+			AuDbgInode(i);
+		}
+		ii_read_unlock(i);
+	}
+	au_iarray_free(array, max);
+
+out:
+	return err;
+}
+
+static int test_children_busy(struct dentry *root, aufs_bindex_t bindex,
+			      const unsigned int verbose)
+{
+	int err;
+	unsigned int sigen;
+
+	sigen = au_sigen(root->d_sb);
+	DiMustNoWaiters(root);
+	IiMustNoWaiters(d_inode(root));
+	di_write_unlock(root);
+	err = test_dentry_busy(root, bindex, sigen, verbose);
+	if (!err)
+		err = test_inode_busy(root->d_sb, bindex, sigen, verbose);
+	di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */
+
+	return err;
+}
+
+static int test_dir_busy(struct file *file, aufs_bindex_t br_id,
+			 struct file **to_free, int *idx)
+{
+	int err;
+	unsigned char matched, root;
+	aufs_bindex_t bindex, bend;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+
+	err = 0;
+	root = IS_ROOT(file->f_path.dentry);
+	if (root) {
+		get_file(file);
+		to_free[*idx] = file;
+		(*idx)++;
+		goto out;
+	}
+
+	matched = 0;
+	fidir = au_fi(file)->fi_hdir;
+	AuDebugOn(!fidir);
+	bend = au_fbend_dir(file);
+	for (bindex = au_fbstart(file); bindex <= bend; bindex++) {
+		hfile = fidir->fd_hfile + bindex;
+		if (!hfile->hf_file)
+			continue;
+
+		if (hfile->hf_br->br_id == br_id) {
+			matched = 1;
+			break;
+		}
+	}
+	if (matched)
+		err = -EBUSY;
+
+out:
+	return err;
+}
+
+static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id,
+			  struct file **to_free, int opened)
+{
+	int err, idx;
+	unsigned long long ull, max;
+	aufs_bindex_t bstart;
+	struct file *file, **array;
+	struct dentry *root;
+	struct au_hfile *hfile;
+
+	array = au_farray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	idx = 0;
+	root = sb->s_root;
+	di_write_unlock(root);
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		fi_read_lock(file);
+		bstart = au_fbstart(file);
+		if (!d_is_dir(file->f_path.dentry)) {
+			hfile = &au_fi(file)->fi_htop;
+			if (hfile->hf_br->br_id == br_id)
+				err = -EBUSY;
+		} else
+			err = test_dir_busy(file, br_id, to_free, &idx);
+		fi_read_unlock(file);
+		if (unlikely(err))
+			break;
+	}
+	di_write_lock_child(root);
+	au_farray_free(array, max);
+	AuDebugOn(idx > opened);
+
+out:
+	return err;
+}
+
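+/*
+ * for each opened dir collected in @to_free, drop the lower file object which
+ * belongs to the deleted branch @br_id and, when it was the first one,
+ * advance the file's start branch to the next opened lower dir.
+ */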
+static void br_del_file(struct file **to_free, unsigned long long opened,
+			  aufs_bindex_t br_id)
+{
+	unsigned long long ull;
+	aufs_bindex_t bindex, bstart, bend, bfound;
+	struct file *file;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+
+	for (ull = 0; ull < opened; ull++) {
+		file = to_free[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		AuDebugOn(!d_is_dir(file->f_path.dentry));
+		bfound = -1;
+		fidir = au_fi(file)->fi_hdir;
+		AuDebugOn(!fidir);
+		fi_write_lock(file);
+		bstart = au_fbstart(file);
+		bend = au_fbend_dir(file);
+		for (bindex = bstart; bindex <= bend; bindex++) {
+			hfile = fidir->fd_hfile + bindex;
+			if (!hfile->hf_file)
+				continue;
+
+			if (hfile->hf_br->br_id == br_id) {
+				bfound = bindex;
+				break;
+			}
+		}
+		AuDebugOn(bfound < 0);
+		au_set_h_fptr(file, bfound, NULL);
+		if (bfound == bstart) {
+			for (bstart++; bstart <= bend; bstart++)
+				if (au_hf_dir(file, bstart)) {
+					au_set_fbstart(file, bstart);
+					break;
+				}
+		}
+		fi_write_unlock(file);
+	}
+}
+
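+/*
+ * the following au_br_do_del_*() helpers remove the slot of the deleted
+ * branch from the corresponding array by shifting the tail down with
+ * memmove() and then shrinking the array with krealloc(). a failed
+ * krealloc() is harmless; the oversized array is simply kept.
+ */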
+static void au_br_do_del_brp(struct au_sbinfo *sbinfo,
+			     const aufs_bindex_t bindex,
+			     const aufs_bindex_t bend)
+{
+	struct au_branch **brp, **p;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	brp = sbinfo->si_branch + bindex;
+	if (bindex < bend)
+		memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex));
+	sbinfo->si_branch[0 + bend] = NULL;
+	sbinfo->si_bend--;
+
+	p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST);
+	if (p)
+		sbinfo->si_branch = p;
+	/* harmless error */
+}
+
+static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex,
+			     const aufs_bindex_t bend)
+{
+	struct au_hdentry *hdp, *p;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	hdp = dinfo->di_hdentry;
+	if (bindex < bend)
+		memmove(hdp + bindex, hdp + bindex + 1,
+			sizeof(*hdp) * (bend - bindex));
+	hdp[0 + bend].hd_dentry = NULL;
+	dinfo->di_bend--;
+
+	p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST);
+	if (p)
+		dinfo->di_hdentry = p;
+	/* harmless error */
+}
+
+static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex,
+			     const aufs_bindex_t bend)
+{
+	struct au_hinode *hip, *p;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	hip = iinfo->ii_hinode + bindex;
+	if (bindex < bend)
+		memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex));
+	iinfo->ii_hinode[0 + bend].hi_inode = NULL;
+	au_hn_init(iinfo->ii_hinode + bend);
+	iinfo->ii_bend--;
+
+	p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST);
+	if (p)
+		iinfo->ii_hinode = p;
+	/* harmless error */
+}
+
+static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex,
+			 struct au_branch *br)
+{
+	aufs_bindex_t bend;
+	struct au_sbinfo *sbinfo;
+	struct dentry *root, *h_root;
+	struct inode *inode, *h_inode;
+	struct au_hinode *hinode;
+
+	SiMustWriteLock(sb);
+
+	root = sb->s_root;
+	inode = d_inode(root);
+	sbinfo = au_sbi(sb);
+	bend = sbinfo->si_bend;
+
+	h_root = au_h_dptr(root, bindex);
+	hinode = au_hi(inode, bindex);
+	h_inode = au_igrab(hinode->hi_inode);
+	au_hiput(hinode);
+
+	au_sbilist_lock();
+	au_br_do_del_brp(sbinfo, bindex, bend);
+	au_br_do_del_hdp(au_di(root), bindex, bend);
+	au_br_do_del_hip(au_ii(inode), bindex, bend);
+	au_sbilist_unlock();
+
+	dput(h_root);
+	iput(h_inode);
+	au_br_do_free(br);
+}
+
+static unsigned long long empty_cb(struct super_block *sb, void *array,
+				   unsigned long long max, void *arg)
+{
+	return max;
+}
+
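+/*
+ * delete the branch specified by @del.
+ * roughly: find the branch index, refuse with -EBUSY while any opened file,
+ * cached dentry or inode still uses the branch, drop the whiteout base
+ * entries, then remove the branch from the internal arrays and refresh the
+ * attributes of the aufs root.
+ */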
+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount)
+{
+	int err, rerr, i;
+	unsigned long long opened;
+	unsigned int mnt_flags;
+	aufs_bindex_t bindex, bend, br_id;
+	unsigned char do_wh, verbose;
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *root;
+	struct file **to_free;
+
+	err = 0;
+	opened = 0;
+	to_free = NULL;
+	root = sb->s_root;
+	bindex = au_find_dbindex(root, del->h_path.dentry);
+	if (bindex < 0) {
+		if (remount)
+			goto out; /* success */
+		err = -ENOENT;
+		pr_err("%s no such branch\n", del->pathname);
+		goto out;
+	}
+	AuDbg("bindex b%d\n", bindex);
+
+	err = -EBUSY;
+	mnt_flags = au_mntflags(sb);
+	verbose = !!au_opt_test(mnt_flags, VERBOSE);
+	bend = au_sbend(sb);
+	if (unlikely(!bend)) {
+		AuVerbose(verbose, "no more branches left\n");
+		goto out;
+	}
+	br = au_sbr(sb, bindex);
+	AuDebugOn(!path_equal(&br->br_path, &del->h_path));
+
+	br_id = br->br_id;
+	opened = atomic_read(&br->br_count);
+	if (unlikely(opened)) {
+		to_free = au_array_alloc(&opened, empty_cb, sb, NULL);
+		err = PTR_ERR(to_free);
+		if (IS_ERR(to_free))
+			goto out;
+
+		err = test_file_busy(sb, br_id, to_free, opened);
+		if (unlikely(err)) {
+			AuVerbose(verbose, "%llu file(s) opened\n", opened);
+			goto out;
+		}
+	}
+
+	wbr = br->br_wbr;
+	do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph);
+	if (do_wh) {
+		/* instead of WbrWhMustWriteLock(wbr) */
+		SiMustWriteLock(sb);
+		for (i = 0; i < AuBrWh_Last; i++) {
+			dput(wbr->wbr_wh[i]);
+			wbr->wbr_wh[i] = NULL;
+		}
+	}
+
+	err = test_children_busy(root, bindex, verbose);
+	if (unlikely(err)) {
+		if (do_wh)
+			goto out_wh;
+		goto out;
+	}
+
+	err = 0;
+	if (to_free) {
+		/*
+		 * now that we have confirmed the branch is deletable,
+		 * free the remaining opened dirs on the branch.
+		 */
+		di_write_unlock(root);
+		br_del_file(to_free, opened, br_id);
+		di_write_lock_child(root);
+	}
+
+	if (!remount)
+		au_br_do_del(sb, bindex, br);
+	else {
+		sysaufs_brs_del(sb, bindex);
+		au_br_do_del(sb, bindex, br);
+		sysaufs_brs_add(sb, bindex);
+	}
+
+	if (!bindex) {
+		au_cpup_attr_all(d_inode(root), /*force*/1);
+		sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes;
+	} else
+		au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry));
+	if (au_opt_test(mnt_flags, PLINK))
+		au_plink_half_refresh(sb, br_id);
+
+	if (au_xino_brid(sb) == br_id)
+		au_xino_brid_set(sb, -1);
+	goto out; /* success */
+
+out_wh:
+	/* revert */
+	rerr = au_br_init_wh(sb, br, br->br_perm);
+	if (rerr)
+		pr_warn("failed re-creating base whiteout, %s. (%d)\n",
+			del->pathname, rerr);
+out:
+	if (to_free)
+		au_farray_free(to_free, opened);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
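+/*
+ * ioctl helper: given an aufs inode number and a branch index from userspace,
+ * report the lower inode number in @arg->h_ino when that inode is still in
+ * use on the branch, or 0 otherwise.
+ */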
+static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg)
+{
+	int err;
+	aufs_bindex_t bstart, bend;
+	struct aufs_ibusy ibusy;
+	struct inode *inode, *h_inode;
+
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = copy_from_user(&ibusy, arg, sizeof(ibusy));
+	if (!err)
+		err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+
+	err = -EINVAL;
+	si_read_lock(sb, AuLock_FLUSH);
+	if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb)))
+		goto out_unlock;
+
+	err = 0;
+	ibusy.h_ino = 0; /* invalid */
+	inode = ilookup(sb, ibusy.ino);
+	if (!inode
+	    || inode->i_ino == AUFS_ROOT_INO
+	    || is_bad_inode(inode))
+		goto out_unlock;
+
+	ii_read_lock_child(inode);
+	bstart = au_ibstart(inode);
+	bend = au_ibend(inode);
+	if (bstart <= ibusy.bindex && ibusy.bindex <= bend) {
+		h_inode = au_h_iptr(inode, ibusy.bindex);
+		if (h_inode && au_test_ibusy(inode, bstart, bend))
+			ibusy.h_ino = h_inode->i_ino;
+	}
+	ii_read_unlock(inode);
+	iput(inode);
+
+out_unlock:
+	si_read_unlock(sb);
+	if (!err) {
+		err = __put_user(ibusy.h_ino, &arg->h_ino);
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+	}
+out:
+	return err;
+}
+
+long au_ibusy_ioctl(struct file *file, unsigned long arg)
+{
+	return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg);
+}
+
+#ifdef CONFIG_COMPAT
+long au_ibusy_compat_ioctl(struct file *file, unsigned long arg)
+{
+	return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg));
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * change a branch permission
+ */
+
+static void au_warn_ima(void)
+{
+#ifdef CONFIG_IMA
+	/* since it doesn't support mark_files_ro() */
+	AuWarn1("RW -> RO makes IMA produce a wrong message\n");
+#endif
+}
+
+static int do_need_sigen_inc(int a, int b)
+{
+	return au_br_whable(a) && !au_br_whable(b);
+}
+
+static int need_sigen_inc(int old, int new)
+{
+	return do_need_sigen_inc(old, new)
+		|| do_need_sigen_inc(new, old);
+}
+
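+/*
+ * called when a branch is changed from rw to ro.
+ * scan all files opened on this mount; fail with -EBUSY if one of them is
+ * mmapped, otherwise strip FMODE_WRITE/FMODE_WRITER from the files writing
+ * to this branch and release their write access to the lower fs.
+ */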
+static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err, do_warn;
+	unsigned int mnt_flags;
+	unsigned long long ull, max;
+	aufs_bindex_t br_id;
+	unsigned char verbose, writer;
+	struct file *file, *hf, **array;
+	struct au_hfile *hfile;
+
+	mnt_flags = au_mntflags(sb);
+	verbose = !!au_opt_test(mnt_flags, VERBOSE);
+
+	array = au_farray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	do_warn = 0;
+	br_id = au_sbr_id(sb, bindex);
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		fi_read_lock(file);
+		if (unlikely(au_test_mmapped(file))) {
+			err = -EBUSY;
+			AuVerbose(verbose, "mmapped %pD\n", file);
+			AuDbgFile(file);
+			FiMustNoWaiters(file);
+			fi_read_unlock(file);
+			goto out_array;
+		}
+
+		hfile = &au_fi(file)->fi_htop;
+		hf = hfile->hf_file;
+		if (!d_is_reg(file->f_path.dentry)
+		    || !(file->f_mode & FMODE_WRITE)
+		    || hfile->hf_br->br_id != br_id
+		    || !(hf->f_mode & FMODE_WRITE))
+			array[ull] = NULL;
+		else {
+			do_warn = 1;
+			get_file(file);
+		}
+
+		FiMustNoWaiters(file);
+		fi_read_unlock(file);
+		fput(file);
+	}
+
+	err = 0;
+	if (do_warn)
+		au_warn_ima();
+
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (!file)
+			continue;
+
+		/* todo: already flushed? */
+		/*
+		 * fs/super.c:mark_files_ro() is gone, but aufs keeps its
+		 * approach which resets f_mode and calls mnt_drop_write() and
+		 * file_release_write() for each file, because the branch
+		 * attribute in aufs world is totally different from the native
+		 * fs rw/ro mode.
+		 */
+		/* fi_read_lock(file); */
+		hfile = &au_fi(file)->fi_htop;
+		hf = hfile->hf_file;
+		/* fi_read_unlock(file); */
+		spin_lock(&hf->f_lock);
+		writer = !!(hf->f_mode & FMODE_WRITER);
+		hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER);
+		spin_unlock(&hf->f_lock);
+		if (writer) {
+			put_write_access(file_inode(hf));
+			__mnt_drop_write(hf->f_path.mnt);
+		}
+	}
+
+out_array:
+	au_farray_free(array, max);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
+	      int *do_refresh)
+{
+	int err, rerr;
+	aufs_bindex_t bindex;
+	struct dentry *root;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	root = sb->s_root;
+	bindex = au_find_dbindex(root, mod->h_root);
+	if (bindex < 0) {
+		if (remount)
+			return 0; /* success */
+		err = -ENOENT;
+		pr_err("%s no such branch\n", mod->path);
+		goto out;
+	}
+	AuDbg("bindex b%d\n", bindex);
+
+	err = test_br(d_inode(mod->h_root), mod->perm, mod->path);
+	if (unlikely(err))
+		goto out;
+
+	br = au_sbr(sb, bindex);
+	AuDebugOn(mod->h_root != au_br_dentry(br));
+	if (br->br_perm == mod->perm)
+		return 0; /* success */
+
+	/* pre-allocate for non-fhsm --> fhsm */
+	bf = NULL;
+	if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) {
+		err = au_fhsm_br_alloc(br);
+		if (unlikely(err))
+			goto out;
+		bf = br->br_fhsm;
+		br->br_fhsm = NULL;
+	}
+
+	if (au_br_writable(br->br_perm)) {
+		/* remove whiteout base */
+		err = au_br_init_wh(sb, br, mod->perm);
+		if (unlikely(err))
+			goto out_bf;
+
+		if (!au_br_writable(mod->perm)) {
+			/* rw --> ro, file might be mmapped */
+			DiMustNoWaiters(root);
+			IiMustNoWaiters(d_inode(root));
+			di_write_unlock(root);
+			err = au_br_mod_files_ro(sb, bindex);
+			/* aufs_write_lock() calls ..._child() */
+			di_write_lock_child(root);
+
+			if (unlikely(err)) {
+				rerr = -ENOMEM;
+				br->br_wbr = kzalloc(sizeof(*br->br_wbr),
+						     GFP_NOFS);
+				if (br->br_wbr)
+					rerr = au_wbr_init(br, sb, br->br_perm);
+				if (unlikely(rerr)) {
+					AuIOErr("nested error %d (%d)\n",
+						rerr, err);
+					br->br_perm = mod->perm;
+				}
+			}
+		}
+	} else if (au_br_writable(mod->perm)) {
+		/* ro --> rw */
+		err = -ENOMEM;
+		br->br_wbr = kzalloc(sizeof(*br->br_wbr), GFP_NOFS);
+		if (br->br_wbr) {
+			err = au_wbr_init(br, sb, mod->perm);
+			if (unlikely(err)) {
+				kfree(br->br_wbr);
+				br->br_wbr = NULL;
+			}
+		}
+	}
+	if (unlikely(err))
+		goto out_bf;
+
+	if (au_br_fhsm(br->br_perm)) {
+		if (!au_br_fhsm(mod->perm)) {
+			/* fhsm --> non-fhsm */
+			au_br_fhsm_fin(br->br_fhsm);
+			kfree(br->br_fhsm);
+			br->br_fhsm = NULL;
+		}
+	} else if (au_br_fhsm(mod->perm))
+		/* non-fhsm --> fhsm */
+		br->br_fhsm = bf;
+
+	*do_refresh |= need_sigen_inc(br->br_perm, mod->perm);
+	br->br_perm = mod->perm;
+	goto out; /* success */
+
+out_bf:
+	kfree(bf);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs)
+{
+	int err;
+	struct kstatfs kstfs;
+
+	err = vfs_statfs(&br->br_path, &kstfs);
+	if (!err) {
+		stfs->f_blocks = kstfs.f_blocks;
+		stfs->f_bavail = kstfs.f_bavail;
+		stfs->f_files = kstfs.f_files;
+		stfs->f_ffree = kstfs.f_ffree;
+	}
+
+	return err;
+}
diff --git b/fs/aufs/branch.h b/fs/aufs/branch.h
new file mode 100644
index 0000000..4c52ae1
--- /dev/null
+++ b/fs/aufs/branch.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * branch filesystems and xino for them
+ */
+
+#ifndef __AUFS_BRANCH_H__
+#define __AUFS_BRANCH_H__
+
+#ifdef __KERNEL__
+
+#include <linux/mount.h>
+#include "dynop.h"
+#include "rwsem.h"
+#include "super.h"
+
+/* ---------------------------------------------------------------------- */
+
+/* a xino file */
+struct au_xino_file {
+	struct file		*xi_file;
+	struct mutex		xi_nondir_mtx;
+
+	/* todo: make xino files an array to support huge inode number */
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		 *xi_dbgaufs;
+#endif
+};
+
+/* File-based Hierarchical Storage Management */
+struct au_br_fhsm {
+#ifdef CONFIG_AUFS_FHSM
+	struct mutex		bf_lock;
+	unsigned long		bf_jiffy;
+	struct aufs_stfs	bf_stfs;
+	int			bf_readable;
+#endif
+};
+
+/* members for writable branch only */
+enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last};
+struct au_wbr {
+	struct au_rwsem		wbr_wh_rwsem;
+	struct dentry		*wbr_wh[AuBrWh_Last];
+	atomic_t		wbr_wh_running;
+#define wbr_whbase		wbr_wh[AuBrWh_BASE]	/* whiteout base */
+#define wbr_plink		wbr_wh[AuBrWh_PLINK]	/* pseudo-link dir */
+#define wbr_orph		wbr_wh[AuBrWh_ORPH]	/* dir for orphans */
+
+	/* mfs mode */
+	unsigned long long	wbr_bytes;
+};
+
+/* ext2 has at least 3 types of operations, ext3 has 4 */
+#define AuBrDynOp (AuDyLast * 4)
+
+#ifdef CONFIG_AUFS_HFSNOTIFY
+/* support for asynchronous destruction */
+struct au_br_hfsnotify {
+	struct fsnotify_group	*hfsn_group;
+};
+#endif
+
+/* sysfs entries */
+struct au_brsysfs {
+	char			name[16];
+	struct attribute	attr;
+};
+
+enum {
+	AuBrSysfs_BR,
+	AuBrSysfs_BRID,
+	AuBrSysfs_Last
+};
+
+/* protected by superblock rwsem */
+struct au_branch {
+	struct au_xino_file	br_xino;
+
+	aufs_bindex_t		br_id;
+
+	int			br_perm;
+	struct path		br_path;
+	spinlock_t		br_dykey_lock;
+	struct au_dykey		*br_dykey[AuBrDynOp];
+	atomic_t		br_count;
+
+	struct au_wbr		*br_wbr;
+	struct au_br_fhsm	*br_fhsm;
+
+	/* xino truncation */
+	atomic_t		br_xino_running;
+
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	struct au_br_hfsnotify	*br_hfsn;
+#endif
+
+#ifdef CONFIG_SYSFS
+	/* entries under sysfs per mount-point */
+	struct au_brsysfs	br_sysfs[AuBrSysfs_Last];
+#endif
+};
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct vfsmount *au_br_mnt(struct au_branch *br)
+{
+	return br->br_path.mnt;
+}
+
+static inline struct dentry *au_br_dentry(struct au_branch *br)
+{
+	return br->br_path.dentry;
+}
+
+static inline struct super_block *au_br_sb(struct au_branch *br)
+{
+	return au_br_mnt(br)->mnt_sb;
+}
+
+static inline int au_br_rdonly(struct au_branch *br)
+{
+	return ((au_br_sb(br)->s_flags & MS_RDONLY)
+		|| !au_br_writable(br->br_perm))
+		? -EROFS : 0;
+}
+
+static inline int au_br_hnotifyable(int brperm __maybe_unused)
+{
+#ifdef CONFIG_AUFS_HNOTIFY
+	return !(brperm & AuBrPerm_RR);
+#else
+	return 0;
+#endif
+}
+
+static inline int au_br_test_oflag(int oflag, struct au_branch *br)
+{
+	int err, exec_flag;
+
+	err = 0;
+	exec_flag = oflag & __FMODE_EXEC;
+	if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC)))
+		err = -EACCES;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* branch.c */
+struct au_sbinfo;
+void au_br_free(struct au_sbinfo *sinfo);
+int au_br_index(struct super_block *sb, aufs_bindex_t br_id);
+struct au_opt_add;
+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount);
+struct au_opt_del;
+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount);
+long au_ibusy_ioctl(struct file *file, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_ibusy_compat_ioctl(struct file *file, unsigned long arg);
+#endif
+struct au_opt_mod;
+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
+	      int *do_refresh);
+struct aufs_stfs;
+int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs);
+
+/* xino.c */
+static const loff_t au_loff_max = LLONG_MAX;
+
+int au_xib_trunc(struct super_block *sb);
+ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size,
+		   loff_t *pos);
+ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
+		    size_t size, loff_t *pos);
+struct file *au_xino_create2(struct file *base_file, struct file *copy_src);
+struct file *au_xino_create(struct super_block *sb, char *fname, int silent);
+ino_t au_xino_new_ino(struct super_block *sb);
+void au_xino_delete_inode(struct inode *inode, const int unlinked);
+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		  ino_t ino);
+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		 ino_t *ino);
+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino,
+	       struct file *base_file, int do_test);
+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex);
+
+struct au_opt_xino;
+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount);
+void au_xino_clr(struct super_block *sb);
+struct file *au_xino_def(struct super_block *sb);
+int au_xino_path(struct seq_file *seq, struct file *file);
+
+/* ---------------------------------------------------------------------- */
+
+/* Superblock to branch */
+static inline
+aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_sbr(sb, bindex)->br_id;
+}
+
+static inline
+struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_mnt(au_sbr(sb, bindex));
+}
+
+static inline
+struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_sb(au_sbr(sb, bindex));
+}
+
+static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex)
+{
+	atomic_dec(&au_sbr(sb, bindex)->br_count);
+}
+
+static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_sbr(sb, bindex)->br_perm;
+}
+
+static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_whable(au_sbr_perm(sb, bindex));
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * wbr_wh_read_lock, wbr_wh_write_lock
+ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock
+ */
+AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem);
+
+#define WbrWhMustNoWaiters(wbr)	AuRwMustNoWaiters(&wbr->wbr_wh_rwsem)
+#define WbrWhMustAnyLock(wbr)	AuRwMustAnyLock(&wbr->wbr_wh_rwsem)
+#define WbrWhMustWriteLock(wbr)	AuRwMustWriteLock(&wbr->wbr_wh_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_FHSM
+static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm)
+{
+	mutex_init(&brfhsm->bf_lock);
+	brfhsm->bf_jiffy = 0;
+	brfhsm->bf_readable = 0;
+}
+
+static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm)
+{
+	mutex_destroy(&brfhsm->bf_lock);
+}
+#else
+AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm)
+AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm)
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_BRANCH_H__ */
diff --git b/fs/aufs/cpup.c b/fs/aufs/cpup.c
new file mode 100644
index 0000000..9ff0963
--- /dev/null
+++ b/fs/aufs/cpup.c
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * copy-up functions, see wbr_policy.c for copy-down
+ */
+
+#include <linux/fs_stack.h>
+#include <linux/mm.h>
+#include <linux/task_work.h>
+#include "aufs.h"
+
+void au_cpup_attr_flags(struct inode *dst, unsigned int iflags)
+{
+	const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE
+		| S_NOATIME | S_NOCMTIME | S_AUTOMOUNT;
+
+	BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags));
+
+	dst->i_flags |= iflags & ~mask;
+	if (au_test_fs_notime(dst->i_sb))
+		dst->i_flags |= S_NOATIME | S_NOCMTIME;
+}
+
+void au_cpup_attr_timesizes(struct inode *inode)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibstart(inode));
+	fsstack_copy_attr_times(inode, h_inode);
+	fsstack_copy_inode_size(inode, h_inode);
+}
+
+void au_cpup_attr_nlink(struct inode *inode, int force)
+{
+	struct inode *h_inode;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bend;
+
+	sb = inode->i_sb;
+	bindex = au_ibstart(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (!force
+	    && !S_ISDIR(h_inode->i_mode)
+	    && au_opt_test(au_mntflags(sb), PLINK)
+	    && au_plink_test(inode))
+		return;
+
+	/*
+	 * 0 can happen while revalidating.
+	 * h_inode->i_mutex may not be held here, but it is harmless since once
+	 * i_nlink reaches 0, it will never become positive again except in the
+	 * O_TMPFILE case.
+	 * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause
+	 *	 the incorrect link count.
+	 */
+	set_nlink(inode, h_inode->i_nlink);
+
+	/*
+	 * a smaller nlink makes find(1) noisy, but a larger one does not.
+	 * it may include the whplink directory.
+	 */
+	if (S_ISDIR(h_inode->i_mode)) {
+		bend = au_ibend(inode);
+		for (bindex++; bindex <= bend; bindex++) {
+			h_inode = au_h_iptr(inode, bindex);
+			if (h_inode)
+				au_add_nlink(inode, h_inode);
+		}
+	}
+}
+
+void au_cpup_attr_changeable(struct inode *inode)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibstart(inode));
+	inode->i_mode = h_inode->i_mode;
+	inode->i_uid = h_inode->i_uid;
+	inode->i_gid = h_inode->i_gid;
+	au_cpup_attr_timesizes(inode);
+	au_cpup_attr_flags(inode, h_inode->i_flags);
+}
+
+void au_cpup_igen(struct inode *inode, struct inode *h_inode)
+{
+	struct au_iinfo *iinfo = au_ii(inode);
+
+	IiMustWriteLock(inode);
+
+	iinfo->ii_higen = h_inode->i_generation;
+	iinfo->ii_hsb1 = h_inode->i_sb;
+}
+
+void au_cpup_attr_all(struct inode *inode, int force)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibstart(inode));
+	au_cpup_attr_changeable(inode);
+	if (inode->i_nlink > 0)
+		au_cpup_attr_nlink(inode, force);
+	inode->i_rdev = h_inode->i_rdev;
+	inode->i_blkbits = h_inode->i_blkbits;
+	au_cpup_igen(inode, h_inode);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */
+
+/* keep the timestamps of the parent dir when cpup */
+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
+		    struct path *h_path)
+{
+	struct inode *h_inode;
+
+	dt->dt_dentry = dentry;
+	dt->dt_h_path = *h_path;
+	h_inode = d_inode(h_path->dentry);
+	dt->dt_atime = h_inode->i_atime;
+	dt->dt_mtime = h_inode->i_mtime;
+	/* smp_mb(); */
+}
+
+void au_dtime_revert(struct au_dtime *dt)
+{
+	struct iattr attr;
+	int err;
+
+	attr.ia_atime = dt->dt_atime;
+	attr.ia_mtime = dt->dt_mtime;
+	attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET
+		| ATTR_ATIME | ATTR_ATIME_SET;
+
+	/* no delegation since this is a directory */
+	err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL);
+	if (unlikely(err))
+		pr_warn("restoring timestamps failed(%d). ignored\n", err);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* internal use only */
+struct au_cpup_reg_attr {
+	int		valid;
+	struct kstat	st;
+	unsigned int	iflags; /* inode->i_flags */
+};
+
+static noinline_for_stack
+int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src,
+	       struct au_cpup_reg_attr *h_src_attr)
+{
+	int err, sbits, icex;
+	unsigned int mnt_flags;
+	unsigned char verbose;
+	struct iattr ia;
+	struct path h_path;
+	struct inode *h_isrc, *h_idst;
+	struct kstat *h_st;
+	struct au_branch *br;
+
+	h_path.dentry = au_h_dptr(dst, bindex);
+	h_idst = d_inode(h_path.dentry);
+	br = au_sbr(dst->d_sb, bindex);
+	h_path.mnt = au_br_mnt(br);
+	h_isrc = d_inode(h_src);
+	ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID
+		| ATTR_ATIME | ATTR_MTIME
+		| ATTR_ATIME_SET | ATTR_MTIME_SET;
+	if (h_src_attr && h_src_attr->valid) {
+		h_st = &h_src_attr->st;
+		ia.ia_uid = h_st->uid;
+		ia.ia_gid = h_st->gid;
+		ia.ia_atime = h_st->atime;
+		ia.ia_mtime = h_st->mtime;
+		if (h_idst->i_mode != h_st->mode
+		    && !S_ISLNK(h_idst->i_mode)) {
+			ia.ia_valid |= ATTR_MODE;
+			ia.ia_mode = h_st->mode;
+		}
+		sbits = !!(h_st->mode & (S_ISUID | S_ISGID));
+		au_cpup_attr_flags(h_idst, h_src_attr->iflags);
+	} else {
+		ia.ia_uid = h_isrc->i_uid;
+		ia.ia_gid = h_isrc->i_gid;
+		ia.ia_atime = h_isrc->i_atime;
+		ia.ia_mtime = h_isrc->i_mtime;
+		if (h_idst->i_mode != h_isrc->i_mode
+		    && !S_ISLNK(h_idst->i_mode)) {
+			ia.ia_valid |= ATTR_MODE;
+			ia.ia_mode = h_isrc->i_mode;
+		}
+		sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID));
+		au_cpup_attr_flags(h_idst, h_isrc->i_flags);
+	}
+	/* no delegation since it is just created */
+	err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
+
+	/* is this nfs only? */
+	if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) {
+		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
+		ia.ia_mode = h_isrc->i_mode;
+		err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
+	}
+
+	icex = br->br_perm & AuBrAttr_ICEX;
+	if (!err) {
+		mnt_flags = au_mntflags(dst->d_sb);
+		verbose = !!au_opt_test(mnt_flags, VERBOSE);
+		err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
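+/*
+ * copy @len bytes from @src to @dst in @blksize chunks.
+ * a chunk which is entirely zero is not written; instead the destination is
+ * lseek'ed forward so that the copy stays sparse, and a trailing hole is
+ * materialized by setting the file size at the end (NFS needs an extra
+ * one-byte write first).
+ */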
+static int au_do_copy_file(struct file *dst, struct file *src, loff_t len,
+			   char *buf, unsigned long blksize)
+{
+	int err;
+	size_t sz, rbytes, wbytes;
+	unsigned char all_zero;
+	char *p, *zp;
+	struct inode *h_inode;
+	/* reduce stack usage */
+	struct iattr *ia;
+
+	zp = page_address(ZERO_PAGE(0));
+	if (unlikely(!zp))
+		return -ENOMEM; /* possible? */
+
+	err = 0;
+	all_zero = 0;
+	while (len) {
+		AuDbg("len %lld\n", len);
+		sz = blksize;
+		if (len < blksize)
+			sz = len;
+
+		rbytes = 0;
+		/* todo: signal_pending? */
+		while (!rbytes || err == -EAGAIN || err == -EINTR) {
+			rbytes = vfsub_read_k(src, buf, sz, &src->f_pos);
+			err = rbytes;
+		}
+		if (unlikely(err < 0))
+			break;
+
+		all_zero = 0;
+		if (len >= rbytes && rbytes == blksize)
+			all_zero = !memcmp(buf, zp, rbytes);
+		if (!all_zero) {
+			wbytes = rbytes;
+			p = buf;
+			while (wbytes) {
+				size_t b;
+
+				b = vfsub_write_k(dst, p, wbytes, &dst->f_pos);
+				err = b;
+				/* todo: signal_pending? */
+				if (unlikely(err == -EAGAIN || err == -EINTR))
+					continue;
+				if (unlikely(err < 0))
+					break;
+				wbytes -= b;
+				p += b;
+			}
+			if (unlikely(err < 0))
+				break;
+		} else {
+			loff_t res;
+
+			AuLabel(hole);
+			res = vfsub_llseek(dst, rbytes, SEEK_CUR);
+			err = res;
+			if (unlikely(res < 0))
+				break;
+		}
+		len -= rbytes;
+		err = 0;
+	}
+
+	/* the last block may be a hole */
+	if (!err && all_zero) {
+		AuLabel(last hole);
+
+		err = 1;
+		if (au_test_nfs(dst->f_path.dentry->d_sb)) {
+			/* nfs requires this step to make the last hole */
+			/* is this only nfs? */
+			do {
+				/* todo: signal_pending? */
+				err = vfsub_write_k(dst, "\0", 1, &dst->f_pos);
+			} while (err == -EAGAIN || err == -EINTR);
+			if (err == 1)
+				dst->f_pos--;
+		}
+
+		if (err == 1) {
+			ia = (void *)buf;
+			ia->ia_size = dst->f_pos;
+			ia->ia_valid = ATTR_SIZE | ATTR_FILE;
+			ia->ia_file = dst;
+			h_inode = file_inode(dst);
+			inode_lock_nested(h_inode, AuLsc_I_CHILD2);
+			/* no delegation since it is just created */
+			err = vfsub_notify_change(&dst->f_path, ia,
+						  /*delegated*/NULL);
+			inode_unlock(h_inode);
+		}
+	}
+
+	return err;
+}
+
+int au_copy_file(struct file *dst, struct file *src, loff_t len)
+{
+	int err;
+	unsigned long blksize;
+	unsigned char do_kfree;
+	char *buf;
+
+	err = -ENOMEM;
+	blksize = dst->f_path.dentry->d_sb->s_blocksize;
+	if (!blksize || PAGE_SIZE < blksize)
+		blksize = PAGE_SIZE;
+	AuDbg("blksize %lu\n", blksize);
+	do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *));
+	if (do_kfree)
+		buf = kmalloc(blksize, GFP_NOFS);
+	else
+		buf = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!buf))
+		goto out;
+
+	if (len > (1 << 22))
+		AuDbg("copying a large file %lld\n", (long long)len);
+
+	src->f_pos = 0;
+	dst->f_pos = 0;
+	err = au_do_copy_file(dst, src, len, buf, blksize);
+	if (do_kfree)
+		kfree(buf);
+	else
+		free_page((unsigned long)buf);
+
+out:
+	return err;
+}
+
+/*
+ * to support a sparse file which is opened with O_APPEND,
+ * we need to close the file.
+ */
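+/*
+ * open both lower files (source read-only, destination write-only), copy
+ * cpg->len bytes with au_copy_file(), then close them again; the destination
+ * is flushed synchronously when the caller is a kernel thread.
+ */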
+static int au_cp_regular(struct au_cp_generic *cpg)
+{
+	int err, i;
+	enum { SRC, DST };
+	struct {
+		aufs_bindex_t bindex;
+		unsigned int flags;
+		struct dentry *dentry;
+		int force_wr;
+		struct file *file;
+		void *label;
+	} *f, file[] = {
+		{
+			.bindex = cpg->bsrc,
+			.flags = O_RDONLY | O_NOATIME | O_LARGEFILE,
+			.label = &&out
+		},
+		{
+			.bindex = cpg->bdst,
+			.flags = O_WRONLY | O_NOATIME | O_LARGEFILE,
+			.force_wr = !!au_ftest_cpup(cpg->flags, RWDST),
+			.label = &&out_src
+		}
+	};
+	struct super_block *sb;
+	struct task_struct *tsk = current;
+
+	/* bsrc branch can be ro/rw. */
+	sb = cpg->dentry->d_sb;
+	f = file;
+	for (i = 0; i < 2; i++, f++) {
+		f->dentry = au_h_dptr(cpg->dentry, f->bindex);
+		f->file = au_h_open(cpg->dentry, f->bindex, f->flags,
+				    /*file*/NULL, f->force_wr);
+		err = PTR_ERR(f->file);
+		if (IS_ERR(f->file))
+			goto *f->label;
+	}
+
+	/* try to prevent updates while we copy up */
+	IMustLock(d_inode(file[SRC].dentry));
+	err = au_copy_file(file[DST].file, file[SRC].file, cpg->len);
+
+	/* I wonder if we had an O_NO_DELAY_FPUT flag */
+	if (tsk->flags & PF_KTHREAD)
+		__fput_sync(file[DST].file);
+	else {
+		WARN(1, "%pD\nPlease report this warning to aufs-users ML",
+		     file[DST].file);
+		fput(file[DST].file);
+		/*
+		 * too bad.
+		 * we have to call both since we don't know where the file
+		 * was added.
+		 */
+		task_work_run();
+		flush_delayed_fput();
+	}
+	au_sbr_put(sb, file[DST].bindex);
+
+out_src:
+	fput(file[SRC].file);
+	au_sbr_put(sb, file[SRC].bindex);
+out:
+	return err;
+}
+
+static int au_do_cpup_regular(struct au_cp_generic *cpg,
+			      struct au_cpup_reg_attr *h_src_attr)
+{
+	int err, rerr;
+	loff_t l;
+	struct path h_path;
+	struct inode *h_src_inode, *h_dst_inode;
+
+	err = 0;
+	h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc);
+	l = i_size_read(h_src_inode);
+	if (cpg->len == -1 || l < cpg->len)
+		cpg->len = l;
+	if (cpg->len) {
+		/* try to prevent updates while we are referencing */
+		inode_lock_nested(h_src_inode, AuLsc_I_CHILD);
+		au_pin_hdir_unlock(cpg->pin);
+
+		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
+		h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc);
+		h_src_attr->iflags = h_src_inode->i_flags;
+		if (!au_test_nfs(h_src_inode->i_sb))
+			err = vfs_getattr(&h_path, &h_src_attr->st);
+		else {
+			inode_unlock(h_src_inode);
+			err = vfs_getattr(&h_path, &h_src_attr->st);
+			inode_lock_nested(h_src_inode, AuLsc_I_CHILD);
+		}
+		if (unlikely(err)) {
+			inode_unlock(h_src_inode);
+			goto out;
+		}
+		h_src_attr->valid = 1;
+		err = au_cp_regular(cpg);
+		inode_unlock(h_src_inode);
+		rerr = au_pin_hdir_relock(cpg->pin);
+		if (!err && rerr)
+			err = rerr;
+	}
+	if (!err && (h_src_inode->i_state & I_LINKABLE)) {
+		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst);
+		h_dst_inode = d_inode(h_path.dentry);
+		spin_lock(&h_dst_inode->i_lock);
+		h_dst_inode->i_state |= I_LINKABLE;
+		spin_unlock(&h_dst_inode->i_lock);
+	}
+
+out:
+	return err;
+}
+
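+/*
+ * read the target of the lower symlink @h_src via its ->readlink() into a
+ * temporary kernel page (under KERNEL_DS) and re-create the symlink on the
+ * destination branch with vfsub_symlink().
+ */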
+static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src,
+			      struct inode *h_dir)
+{
+	int err, symlen;
+	mm_segment_t old_fs;
+	union {
+		char *k;
+		char __user *u;
+	} sym;
+	struct inode *h_inode = d_inode(h_src);
+	const struct inode_operations *h_iop = h_inode->i_op;
+
+	err = -ENOSYS;
+	if (unlikely(!h_iop->readlink))
+		goto out;
+
+	err = -ENOMEM;
+	sym.k = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!sym.k))
+		goto out;
+
+	/* no need to care about mmap_sem since a symlink is not mmap-able */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	symlen = h_iop->readlink(h_src, sym.u, PATH_MAX);
+	err = symlen;
+	set_fs(old_fs);
+
+	if (symlen > 0) {
+		sym.k[symlen] = 0;
+		err = vfsub_symlink(h_dir, h_path, sym.k);
+	}
+	free_page((unsigned long)sym.k);
+
+out:
+	return err;
+}
+
+/*
+ * regardless of the 'acl' option, reset all ACLs.
+ * all ACLs will be copied up later from the original entry on the lower branch.
+ */
+static int au_reset_acl(struct inode *h_dir, struct path *h_path, umode_t mode)
+{
+	int err;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_dentry = h_path->dentry;
+	h_inode = d_inode(h_dentry);
+	/* forget_all_cached_acls(h_inode)); */
+	err = vfsub_removexattr(h_dentry, XATTR_NAME_POSIX_ACL_ACCESS);
+	AuTraceErr(err);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	if (!err)
+		err = vfsub_acl_chmod(h_inode, mode);
+
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_do_cpup_dir(struct au_cp_generic *cpg, struct dentry *dst_parent,
+			  struct inode *h_dir, struct path *h_path)
+{
+	int err;
+	struct inode *dir, *inode;
+
+	err = vfsub_removexattr(h_path->dentry, XATTR_NAME_POSIX_ACL_DEFAULT);
+	AuTraceErr(err);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * strange behaviour from the user's point of view,
+	 * particularly in the setattr case
+	 */
+	dir = d_inode(dst_parent);
+	if (au_ibstart(dir) == cpg->bdst)
+		au_cpup_attr_nlink(dir, /*force*/1);
+	inode = d_inode(cpg->dentry);
+	au_cpup_attr_nlink(inode, /*force*/1);
+
+out:
+	return err;
+}
+
+static noinline_for_stack
+int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent,
+	       struct au_cpup_reg_attr *h_src_attr)
+{
+	int err;
+	umode_t mode;
+	unsigned int mnt_flags;
+	unsigned char isdir, isreg, force;
+	const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME);
+	struct au_dtime dt;
+	struct path h_path;
+	struct dentry *h_src, *h_dst, *h_parent;
+	struct inode *h_inode, *h_dir;
+	struct super_block *sb;
+
+	/* bsrc branch can be ro/rw. */
+	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
+	h_inode = d_inode(h_src);
+	AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc));
+
+	/* try to prevent being referenced while we are creating */
+	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
+	if (au_ftest_cpup(cpg->flags, RENAME))
+		AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX,
+				  AUFS_WH_PFX_LEN));
+	h_parent = h_dst->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+	AuDebugOn(h_parent != h_dst->d_parent);
+
+	sb = cpg->dentry->d_sb;
+	h_path.mnt = au_sbr_mnt(sb, cpg->bdst);
+	if (do_dt) {
+		h_path.dentry = h_parent;
+		au_dtime_store(&dt, dst_parent, &h_path);
+	}
+	h_path.dentry = h_dst;
+
+	isreg = 0;
+	isdir = 0;
+	mode = h_inode->i_mode;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		isreg = 1;
+		err = vfsub_create(h_dir, &h_path, S_IRUSR | S_IWUSR,
+				   /*want_excl*/true);
+		if (!err)
+			err = au_do_cpup_regular(cpg, h_src_attr);
+		break;
+	case S_IFDIR:
+		isdir = 1;
+		err = vfsub_mkdir(h_dir, &h_path, mode);
+		if (!err)
+			err = au_do_cpup_dir(cpg, dst_parent, h_dir, &h_path);
+		break;
+	case S_IFLNK:
+		err = au_do_cpup_symlink(&h_path, h_src, h_dir);
+		break;
+	case S_IFCHR:
+	case S_IFBLK:
+		AuDebugOn(!capable(CAP_MKNOD));
+		/*FALLTHROUGH*/
+	case S_IFIFO:
+	case S_IFSOCK:
+		err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev);
+		break;
+	default:
+		AuIOErr("Unknown inode type 0%o\n", mode);
+		err = -EIO;
+	}
+	if (!err)
+		err = au_reset_acl(h_dir, &h_path, mode);
+
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, UDBA_NONE)
+	    && !isdir
+	    && au_opt_test(mnt_flags, XINO)
+	    && (h_inode->i_nlink == 1
+		|| (h_inode->i_state & I_LINKABLE))
+	    /* todo: unnecessary? */
+	    /* && d_inode(cpg->dentry)->i_nlink == 1 */
+	    && cpg->bdst < cpg->bsrc
+	    && !au_ftest_cpup(cpg->flags, KEEPLINO))
+		au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0);
+		/* ignore this error */
+
+	if (!err) {
+		force = 0;
+		if (isreg) {
+			force = !!cpg->len;
+			if (cpg->len == -1)
+				force = !!i_size_read(h_inode);
+		}
+		au_fhsm_wrote(sb, cpg->bdst, force);
+	}
+
+	if (do_dt)
+		au_dtime_revert(&dt);
+	return err;
+}
+
+static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path)
+{
+	int err;
+	struct dentry *dentry, *h_dentry, *h_parent, *parent;
+	struct inode *h_dir;
+	aufs_bindex_t bdst;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	h_dentry = au_h_dptr(dentry, bdst);
+	if (!au_ftest_cpup(cpg->flags, OVERWRITE)) {
+		dget(h_dentry);
+		au_set_h_dptr(dentry, bdst, NULL);
+		err = au_lkup_neg(dentry, bdst, /*wh*/0);
+		if (!err)
+			h_path->dentry = dget(au_h_dptr(dentry, bdst));
+		au_set_h_dptr(dentry, bdst, h_dentry);
+	} else {
+		err = 0;
+		parent = dget_parent(dentry);
+		h_parent = au_h_dptr(parent, bdst);
+		dput(parent);
+		h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
+		if (IS_ERR(h_path->dentry))
+			err = PTR_ERR(h_path->dentry);
+	}
+	if (unlikely(err))
+		goto out;
+
+	h_parent = h_dentry->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+	AuDbg("%pd %pd\n", h_dentry, h_path->dentry);
+	/* no delegation since it is just created */
+	err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL);
+	dput(h_path->dentry);
+
+out:
+	return err;
+}
+
+/*
+ * copyup the @dentry from @bsrc to @bdst.
+ * the caller must set both of the lower dentries.
+ * @len is for truncating; when it is -1, copy up the entire file.
+ * in link/rename cases, @dst_parent may be different from the real one.
+ * cpg->bsrc can be larger than cpg->bdst.
+ */
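+/*
+ * outline: if the inode already exists on @bdst as a pseudo-link, just link
+ * it (and optionally rename); otherwise create the entry via cpup_entry(),
+ * copy the attributes via cpup_iattr(), extend the inode's branch range, and
+ * unlink/rmdir the half-made entry when anything fails.
+ */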
+static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
+{
+	int err, rerr;
+	aufs_bindex_t old_ibstart;
+	unsigned char isdir, plink;
+	struct dentry *h_src, *h_dst, *h_parent;
+	struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct path h_path;
+		struct au_cpup_reg_attr h_src_attr;
+	} *a;
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+	a->h_src_attr.valid = 0;
+
+	sb = cpg->dentry->d_sb;
+	br = au_sbr(sb, cpg->bdst);
+	a->h_path.mnt = au_br_mnt(br);
+	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
+	h_parent = h_dst->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
+	inode = d_inode(cpg->dentry);
+
+	if (!dst_parent)
+		dst_parent = dget_parent(cpg->dentry);
+	else
+		dget(dst_parent);
+
+	plink = !!au_opt_test(au_mntflags(sb), PLINK);
+	dst_inode = au_h_iptr(inode, cpg->bdst);
+	if (dst_inode) {
+		if (unlikely(!plink)) {
+			err = -EIO;
+			AuIOErr("hi%lu(i%lu) exists on b%d "
+				"but plink is disabled\n",
+				dst_inode->i_ino, inode->i_ino, cpg->bdst);
+			goto out_parent;
+		}
+
+		if (dst_inode->i_nlink) {
+			const int do_dt = au_ftest_cpup(cpg->flags, DTIME);
+
+			h_src = au_plink_lkup(inode, cpg->bdst);
+			err = PTR_ERR(h_src);
+			if (IS_ERR(h_src))
+				goto out_parent;
+			if (unlikely(d_is_negative(h_src))) {
+				err = -EIO;
+				AuIOErr("i%lu exists on b%d "
+					"but not pseudo-linked\n",
+					inode->i_ino, cpg->bdst);
+				dput(h_src);
+				goto out_parent;
+			}
+
+			if (do_dt) {
+				a->h_path.dentry = h_parent;
+				au_dtime_store(&a->dt, dst_parent, &a->h_path);
+			}
+
+			a->h_path.dentry = h_dst;
+			delegated = NULL;
+			err = vfsub_link(h_src, h_dir, &a->h_path, &delegated);
+			if (!err && au_ftest_cpup(cpg->flags, RENAME))
+				err = au_do_ren_after_cpup(cpg, &a->h_path);
+			if (do_dt)
+				au_dtime_revert(&a->dt);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+			dput(h_src);
+			goto out_parent;
+		} else
+			/* todo: cpup_wh_file? */
+			/* udba work */
+			au_update_ibrange(inode, /*do_put_zero*/1);
+	}
+
+	isdir = S_ISDIR(inode->i_mode);
+	old_ibstart = au_ibstart(inode);
+	err = cpup_entry(cpg, dst_parent, &a->h_src_attr);
+	if (unlikely(err))
+		goto out_rev;
+	dst_inode = d_inode(h_dst);
+	inode_lock_nested(dst_inode, AuLsc_I_CHILD2);
+	/* todo: necessary? */
+	/* au_pin_hdir_unlock(cpg->pin); */
+
+	err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr);
+	if (unlikely(err)) {
+		/* todo: necessary? */
+		/* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */
+		inode_unlock(dst_inode);
+		goto out_rev;
+	}
+
+	if (cpg->bdst < old_ibstart) {
+		if (S_ISREG(inode->i_mode)) {
+			err = au_dy_iaop(inode, cpg->bdst, dst_inode);
+			if (unlikely(err)) {
+				/* ignore an error */
+				/* au_pin_hdir_relock(cpg->pin); */
+				inode_unlock(dst_inode);
+				goto out_rev;
+			}
+		}
+		au_set_ibstart(inode, cpg->bdst);
+	} else
+		au_set_ibend(inode, cpg->bdst);
+	au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode),
+		      au_hi_flags(inode, isdir));
+
+	/* todo: necessary? */
+	/* err = au_pin_hdir_relock(cpg->pin); */
+	inode_unlock(dst_inode);
+	if (unlikely(err))
+		goto out_rev;
+
+	src_inode = d_inode(h_src);
+	if (!isdir
+	    && (src_inode->i_nlink > 1
+		|| src_inode->i_state & I_LINKABLE)
+	    && plink)
+		au_plink_append(inode, cpg->bdst, h_dst);
+
+	if (au_ftest_cpup(cpg->flags, RENAME)) {
+		a->h_path.dentry = h_dst;
+		err = au_do_ren_after_cpup(cpg, &a->h_path);
+	}
+	if (!err)
+		goto out_parent; /* success */
+
+	/* revert */
+out_rev:
+	a->h_path.dentry = h_parent;
+	au_dtime_store(&a->dt, dst_parent, &a->h_path);
+	a->h_path.dentry = h_dst;
+	rerr = 0;
+	if (d_is_positive(h_dst)) {
+		if (!isdir) {
+			/* no delegation since it is just created */
+			rerr = vfsub_unlink(h_dir, &a->h_path,
+					    /*delegated*/NULL, /*force*/0);
+		} else
+			rerr = vfsub_rmdir(h_dir, &a->h_path);
+	}
+	au_dtime_revert(&a->dt);
+	if (rerr) {
+		AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr);
+		err = -EIO;
+	}
+out_parent:
+	dput(dst_parent);
+	kfree(a);
+out:
+	return err;
+}
+
+#if 0 /* reserved */
+struct au_cpup_single_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+	struct dentry *dst_parent;
+};
+
+static void au_call_cpup_single(void *args)
+{
+	struct au_cpup_single_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_single(a->cpg, a->dst_parent);
+	au_pin_hdir_release(a->cpg->pin);
+}
+#endif
+
+/*
+ * prevent SIGXFSZ in copy-up.
+ * testing CAP_MKNOD is for generic filesystems,
+ * but CAP_FSETID is currently for xfs only.
+ */
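+/*
+ * returns non-zero when the copy-up should be performed via the aufs
+ * workqueue instead of in the current task's context.
+ */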
+static int au_cpup_sio_test(struct au_pin *pin, umode_t mode)
+{
+	int do_sio;
+	struct super_block *sb;
+	struct inode *h_dir;
+
+	do_sio = 0;
+	sb = au_pinned_parent(pin)->d_sb;
+	if (!au_wkq_test()
+	    && (!au_sbi(sb)->si_plink_maint_pid
+		|| au_plink_maint(sb, AuLock_NOPLM))) {
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			/* no condition about RLIMIT_FSIZE and the file size */
+			do_sio = 1;
+			break;
+		case S_IFCHR:
+		case S_IFBLK:
+			do_sio = !capable(CAP_MKNOD);
+			break;
+		}
+		if (!do_sio)
+			do_sio = ((mode & (S_ISUID | S_ISGID))
+				  && !capable(CAP_FSETID));
+		/* this workaround may be removed in the future */
+		if (!do_sio) {
+			h_dir = au_pinned_h_dir(pin);
+			do_sio = h_dir->i_mode & S_ISVTX;
+		}
+	}
+
+	return do_sio;
+}
+
+#if 0 /* reserved */
+int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
+{
+	int err, wkq_err;
+	struct dentry *h_dentry;
+
+	h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
+	if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode))
+		err = au_cpup_single(cpg, dst_parent);
+	else {
+		struct au_cpup_single_args args = {
+			.errp		= &err,
+			.cpg		= cpg,
+			.dst_parent	= dst_parent
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_single, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
+#endif
+
+/*
+ * copyup the @dentry from the first active lower branch to @bdst,
+ * using au_cpup_single().
+ */
+static int au_cpup_simple(struct au_cp_generic *cpg)
+{
+	int err;
+	unsigned int flags_orig;
+	struct dentry *dentry;
+
+	AuDebugOn(cpg->bsrc < 0);
+
+	dentry = cpg->dentry;
+	DiMustWriteLock(dentry);
+
+	err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1);
+	if (!err) {
+		flags_orig = cpg->flags;
+		au_fset_cpup(cpg->flags, RENAME);
+		err = au_cpup_single(cpg, NULL);
+		cpg->flags = flags_orig;
+		if (!err)
+			return 0; /* success */
+
+		/* revert */
+		au_set_h_dptr(dentry, cpg->bdst, NULL);
+		au_set_dbstart(dentry, cpg->bsrc);
+	}
+
+	return err;
+}
+
+struct au_cpup_simple_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+};
+
+static void au_call_cpup_simple(void *args)
+{
+	struct au_cpup_simple_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_simple(a->cpg);
+	au_pin_hdir_release(a->cpg->pin);
+}
+
+static int au_do_sio_cpup_simple(struct au_cp_generic *cpg)
+{
+	int err, wkq_err;
+	struct dentry *dentry, *parent;
+	struct file *h_file;
+	struct inode *h_dir;
+
+	dentry = cpg->dentry;
+	h_file = NULL;
+	if (au_ftest_cpup(cpg->flags, HOPEN)) {
+		AuDebugOn(cpg->bsrc < 0);
+		h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0);
+		err = PTR_ERR(h_file);
+		if (IS_ERR(h_file))
+			goto out;
+	}
+
+	parent = dget_parent(dentry);
+	h_dir = au_h_iptr(d_inode(parent), cpg->bdst);
+	if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE)
+	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
+		err = au_cpup_simple(cpg);
+	else {
+		struct au_cpup_simple_args args = {
+			.errp		= &err,
+			.cpg		= cpg
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_simple, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	dput(parent);
+	if (h_file)
+		au_h_open_post(dentry, cpg->bsrc, h_file);
+
+out:
+	return err;
+}
+
+int au_sio_cpup_simple(struct au_cp_generic *cpg)
+{
+	aufs_bindex_t bsrc, bend;
+	struct dentry *dentry, *h_dentry;
+
+	if (cpg->bsrc < 0) {
+		dentry = cpg->dentry;
+		bend = au_dbend(dentry);
+		for (bsrc = cpg->bdst + 1; bsrc <= bend; bsrc++) {
+			h_dentry = au_h_dptr(dentry, bsrc);
+			if (h_dentry) {
+				AuDebugOn(d_is_negative(h_dentry));
+				break;
+			}
+		}
+		AuDebugOn(bsrc > bend);
+		cpg->bsrc = bsrc;
+	}
+	AuDebugOn(cpg->bsrc <= cpg->bdst);
+	return au_do_sio_cpup_simple(cpg);
+}
+
+int au_sio_cpdown_simple(struct au_cp_generic *cpg)
+{
+	AuDebugOn(cpg->bdst <= cpg->bsrc);
+	return au_do_sio_cpup_simple(cpg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * copyup the deleted file for writing.
+ */
+static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry,
+			 struct file *file)
+{
+	int err;
+	unsigned int flags_orig;
+	aufs_bindex_t bsrc_orig;
+	struct dentry *h_d_dst, *h_d_start;
+	struct au_dinfo *dinfo;
+	struct au_hdentry *hdp;
+
+	dinfo = au_di(cpg->dentry);
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	bsrc_orig = cpg->bsrc;
+	cpg->bsrc = dinfo->di_bstart;
+	hdp = dinfo->di_hdentry;
+	h_d_dst = hdp[0 + cpg->bdst].hd_dentry;
+	dinfo->di_bstart = cpg->bdst;
+	hdp[0 + cpg->bdst].hd_dentry = wh_dentry;
+	h_d_start = NULL;
+	if (file) {
+		h_d_start = hdp[0 + cpg->bsrc].hd_dentry;
+		hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry;
+	}
+	flags_orig = cpg->flags;
+	cpg->flags = !AuCpup_DTIME;
+	err = au_cpup_single(cpg, /*h_parent*/NULL);
+	cpg->flags = flags_orig;
+	if (file) {
+		if (!err)
+			err = au_reopen_nondir(file);
+		hdp[0 + cpg->bsrc].hd_dentry = h_d_start;
+	}
+	hdp[0 + cpg->bdst].hd_dentry = h_d_dst;
+	dinfo->di_bstart = cpg->bsrc;
+	cpg->bsrc = bsrc_orig;
+
+	return err;
+}
+
+static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file)
+{
+	int err;
+	aufs_bindex_t bdst;
+	struct au_dtime dt;
+	struct dentry *dentry, *parent, *h_parent, *wh_dentry;
+	struct au_branch *br;
+	struct path h_path;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	br = au_sbr(dentry->d_sb, bdst);
+	parent = dget_parent(dentry);
+	h_parent = au_h_dptr(parent, bdst);
+	wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out;
+
+	h_path.dentry = h_parent;
+	h_path.mnt = au_br_mnt(br);
+	au_dtime_store(&dt, parent, &h_path);
+	err = au_do_cpup_wh(cpg, wh_dentry, file);
+	if (unlikely(err))
+		goto out_wh;
+
+	dget(wh_dentry);
+	h_path.dentry = wh_dentry;
+	if (!d_is_dir(wh_dentry)) {
+		/* no delegation since it is just created */
+		err = vfsub_unlink(d_inode(h_parent), &h_path,
+				   /*delegated*/NULL, /*force*/0);
+	} else
+		err = vfsub_rmdir(d_inode(h_parent), &h_path);
+	if (unlikely(err)) {
+		AuIOErr("failed remove copied-up tmp file %pd(%d)\n",
+			wh_dentry, err);
+		err = -EIO;
+	}
+	au_dtime_revert(&dt);
+	au_set_hi_wh(d_inode(dentry), bdst, wh_dentry);
+
+out_wh:
+	dput(wh_dentry);
+out:
+	dput(parent);
+	return err;
+}
+
+struct au_cpup_wh_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+	struct file *file;
+};
+
+static void au_call_cpup_wh(void *args)
+{
+	struct au_cpup_wh_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_wh(a->cpg, a->file);
+	au_pin_hdir_release(a->cpg->pin);
+}
+
+int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file)
+{
+	int err, wkq_err;
+	aufs_bindex_t bdst;
+	struct dentry *dentry, *parent, *h_orph, *h_parent;
+	struct inode *dir, *h_dir, *h_tmpdir;
+	struct au_wbr *wbr;
+	struct au_pin wh_pin, *pin_orig;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	h_orph = NULL;
+	h_parent = NULL;
+	h_dir = au_igrab(au_h_iptr(dir, bdst));
+	h_tmpdir = h_dir;
+	pin_orig = NULL;
+	if (!h_dir->i_nlink) {
+		wbr = au_sbr(dentry->d_sb, bdst)->br_wbr;
+		h_orph = wbr->wbr_orph;
+
+		h_parent = dget(au_h_dptr(parent, bdst));
+		au_set_h_dptr(parent, bdst, dget(h_orph));
+		h_tmpdir = d_inode(h_orph);
+		au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0);
+
+		inode_lock_nested(h_tmpdir, AuLsc_I_PARENT3);
+		/* todo: au_h_open_pre()? */
+
+		pin_orig = cpg->pin;
+		au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT,
+			    AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED);
+		cpg->pin = &wh_pin;
+	}
+
+	if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)
+	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
+		err = au_cpup_wh(cpg, file);
+	else {
+		struct au_cpup_wh_args args = {
+			.errp	= &err,
+			.cpg	= cpg,
+			.file	= file
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_wh, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	if (h_orph) {
+		inode_unlock(h_tmpdir);
+		/* todo: au_h_open_post()? */
+		au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0);
+		au_set_h_dptr(parent, bdst, h_parent);
+		AuDebugOn(!pin_orig);
+		cpg->pin = pin_orig;
+	}
+	iput(h_dir);
+	dput(parent);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * generic routine for both copy-up and copy-down.
+ */
+/* cf. revalidate function in file.c */
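+/*
+ * repeatedly find the topmost ancestor of @dentry which does not yet exist
+ * on @bdst, pin it and apply @cp to it, until the parent of @dentry itself
+ * exists on @bdst.
+ */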
+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
+	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg),
+	       void *arg)
+{
+	int err;
+	struct au_pin pin;
+	struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry;
+
+	err = 0;
+	parent = dget_parent(dentry);
+	if (IS_ROOT(parent))
+		goto out;
+
+	au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2,
+		    au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE);
+
+	/* do not use au_dpage */
+	real_parent = parent;
+	while (1) {
+		dput(parent);
+		parent = dget_parent(dentry);
+		h_parent = au_h_dptr(parent, bdst);
+		if (h_parent)
+			goto out; /* success */
+
+		/* find the topmost dir which needs cpup */
+		do {
+			d = parent;
+			dput(parent);
+			parent = dget_parent(d);
+			di_read_lock_parent3(parent, !AuLock_IR);
+			h_parent = au_h_dptr(parent, bdst);
+			di_read_unlock(parent, !AuLock_IR);
+		} while (!h_parent);
+
+		if (d != real_parent)
+			di_write_lock_child3(d);
+
+		/* somebody else might create while we were sleeping */
+		h_dentry = au_h_dptr(d, bdst);
+		if (!h_dentry || d_is_negative(h_dentry)) {
+			if (h_dentry)
+				au_update_dbstart(d);
+
+			au_pin_set_dentry(&pin, d);
+			err = au_do_pin(&pin);
+			if (!err) {
+				err = cp(d, bdst, &pin, h_parent, arg);
+				au_unpin(&pin);
+			}
+		}
+
+		if (d != real_parent)
+			di_write_unlock(d);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	dput(parent);
+	return err;
+}
+
+static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst,
+		       struct au_pin *pin,
+		       struct dentry *h_parent __maybe_unused,
+		       void *arg __maybe_unused)
+{
+	struct au_cp_generic cpg = {
+		.dentry	= dentry,
+		.bdst	= bdst,
+		.bsrc	= -1,
+		.len	= 0,
+		.pin	= pin,
+		.flags	= AuCpup_DTIME
+	};
+	return au_sio_cpup_simple(&cpg);
+}
+
+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL);
+}
+
+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	int err;
+	struct dentry *parent;
+	struct inode *dir;
+
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	err = 0;
+	if (au_h_iptr(dir, bdst))
+		goto out;
+
+	di_read_unlock(parent, AuLock_IR);
+	di_write_lock_parent(parent);
+	/* someone else might change our inode while we were sleeping */
+	if (!au_h_iptr(dir, bdst))
+		err = au_cpup_dirs(dentry, bdst);
+	di_downgrade_lock(parent, AuLock_IR);
+
+out:
+	dput(parent);
+	return err;
+}
diff --git b/fs/aufs/cpup.h b/fs/aufs/cpup.h
new file mode 100644
index 0000000..ccba2c4
--- /dev/null
+++ b/fs/aufs/cpup.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * copy-up/down functions
+ */
+
+#ifndef __AUFS_CPUP_H__
+#define __AUFS_CPUP_H__
+
+#ifdef __KERNEL__
+
+#include <linux/path.h>
+
+struct inode;
+struct file;
+struct au_pin;
+
+void au_cpup_attr_flags(struct inode *dst, unsigned int iflags);
+void au_cpup_attr_timesizes(struct inode *inode);
+void au_cpup_attr_nlink(struct inode *inode, int force);
+void au_cpup_attr_changeable(struct inode *inode);
+void au_cpup_igen(struct inode *inode, struct inode *h_inode);
+void au_cpup_attr_all(struct inode *inode, int force);
+
+/* ---------------------------------------------------------------------- */
+
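+/*
+ * Parameter bundle for the copy-up/copy-down routines below: @dentry is
+ * copied from branch @bsrc to branch @bdst (@len bytes of data), with @pin
+ * keeping the parent directory pinned.  As au_cpup_dir() in cpup.c shows,
+ * a caller may pass bsrc == -1 and len == 0, presumably leaving the choice
+ * of the source branch (and the length) to the copy-up code itself.
+ */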
+struct au_cp_generic {
+	struct dentry	*dentry;
+	aufs_bindex_t	bdst, bsrc;
+	loff_t		len;
+	struct au_pin	*pin;
+	unsigned int	flags;
+};
+
+/* cpup flags */
+#define AuCpup_DTIME		1		/* do dtime_store/revert */
+#define AuCpup_KEEPLINO		(1 << 1)	/* do not clear the lower xino,
+						   for link(2) */
+#define AuCpup_RENAME		(1 << 2)	/* rename after cpup */
+#define AuCpup_HOPEN		(1 << 3)	/* call h_open_pre/post() in
+						   cpup */
+#define AuCpup_OVERWRITE	(1 << 4)	/* allow overwriting the
+						   existing entry */
+#define AuCpup_RWDST		(1 << 5)	/* force write target even if
+						   the branch is marked as RO */
+
+#define au_ftest_cpup(flags, name)	((flags) & AuCpup_##name)
+#define au_fset_cpup(flags, name) \
+	do { (flags) |= AuCpup_##name; } while (0)
+#define au_fclr_cpup(flags, name) \
+	do { (flags) &= ~AuCpup_##name; } while (0)
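+
+/*
+ * A minimal usage sketch of the helpers above (illustrative only):
+ *
+ *	unsigned int flags = AuCpup_DTIME;
+ *	au_fset_cpup(flags, HOPEN);
+ *	if (au_ftest_cpup(flags, HOPEN))
+ *		au_fclr_cpup(flags, DTIME);
+ */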
+
+int au_copy_file(struct file *dst, struct file *src, loff_t len);
+int au_sio_cpup_simple(struct au_cp_generic *cpg);
+int au_sio_cpdown_simple(struct au_cp_generic *cpg);
+int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file);
+
+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
+	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg),
+	       void *arg);
+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+
+/* ---------------------------------------------------------------------- */
+
+/* keep timestamps when copyup */
+struct au_dtime {
+	struct dentry *dt_dentry;
+	struct path dt_h_path;
+	struct timespec dt_atime, dt_mtime;
+};
+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
+		    struct path *h_path);
+void au_dtime_revert(struct au_dtime *dt);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_CPUP_H__ */
diff --git b/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c
new file mode 100644
index 0000000..0aefb5e
--- /dev/null
+++ b/fs/aufs/dbgaufs.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debugfs interface
+ */
+
+#include <linux/debugfs.h>
+#include "aufs.h"
+
+#ifndef CONFIG_SYSFS
+#error DEBUG_FS depends upon SYSFS
+#endif
+
+static struct dentry *dbgaufs;
+static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH;
+
+/* 20 is the maximum number of decimal digits in an unsigned 64-bit integer */
+struct dbgaufs_arg {
+	int n;
+	char a[20 * 4];
+};
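+
+/*
+ * 'a' holds the single line built by dbgaufs_xi_open() below; 20 * 4 bytes
+ * leave room for up to four 64-bit decimal numbers, and the AuDebugOn() in
+ * dbgaufs_xi_open() catches an overflow.
+ */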
+
+/*
+ * common function for all XINO files
+ */
+static int dbgaufs_xi_release(struct inode *inode __maybe_unused,
+			      struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt)
+{
+	int err;
+	struct kstat st;
+	struct dbgaufs_arg *p;
+
+	err = -ENOMEM;
+	p = kmalloc(sizeof(*p), GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = 0;
+	p->n = 0;
+	file->private_data = p;
+	if (!xf)
+		goto out;
+
+	err = vfs_getattr(&xf->f_path, &st);
+	if (!err) {
+		if (do_fcnt)
+			p->n = snprintf
+				(p->a, sizeof(p->a), "%ld, %llux%lu %lld\n",
+				 (long)file_count(xf), st.blocks, st.blksize,
+				 (long long)st.size);
+		else
+			p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n",
+					st.blocks, st.blksize,
+					(long long)st.size);
+		AuDebugOn(p->n >= sizeof(p->a));
+	} else {
+		p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err);
+		err = 0;
+	}
+
+out:
+	return err;
+}
+
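+/*
+ * Returns the line prepared by dbgaufs_xi_open() verbatim, e.g.
+ * "16x4096 65536\n" (blocks x block size, then the file size), with the
+ * file count prepended when do_fcnt was set.
+ */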
+static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct dbgaufs_arg *p;
+
+	p = file->private_data;
+	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dbgaufs_plink_arg {
+	int n;
+	char a[];
+};
+
+static int dbgaufs_plink_release(struct inode *inode __maybe_unused,
+				 struct file *file)
+{
+	free_page((unsigned long)file->private_data);
+	return 0;
+}
+
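+/*
+ * Fills a page-sized buffer with three lines: the number of hash buckets,
+ * the per-bucket pseudo-link counts separated by blanks, and their sum.
+ * When the PLINK option is disabled it degenerates to "1\n0\n0\n".
+ */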
+static int dbgaufs_plink_open(struct inode *inode, struct file *file)
+{
+	int err, i, limit;
+	unsigned long n, sum;
+	struct dbgaufs_plink_arg *p;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct au_sphlhead *sphl;
+
+	err = -ENOMEM;
+	p = (void *)get_zeroed_page(GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = -EFBIG;
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		limit = PAGE_SIZE - sizeof(p->n);
+
+		/* the number of buckets */
+		n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH);
+		p->n += n;
+		limit -= n;
+
+		sum = 0;
+		for (i = 0, sphl = sbinfo->si_plink;
+		     i < AuPlink_NHASH;
+		     i++, sphl++) {
+			n = au_sphl_count(sphl);
+			sum += n;
+
+			n = snprintf(p->a + p->n, limit, "%lu ", n);
+			p->n += n;
+			limit -= n;
+			if (unlikely(limit <= 0))
+				goto out_free;
+		}
+		p->a[p->n - 1] = '\n';
+
+		/* the sum of plinks */
+		n = snprintf(p->a + p->n, limit, "%lu\n", sum);
+		p->n += n;
+		limit -= n;
+		if (unlikely(limit <= 0))
+			goto out_free;
+	} else {
+#define str "1\n0\n0\n"
+		p->n = sizeof(str) - 1;
+		strcpy(p->a, str);
+#undef str
+	}
+	si_read_unlock(sb);
+
+	err = 0;
+	file->private_data = p;
+	goto out; /* success */
+
+out_free:
+	free_page((unsigned long)p);
+out:
+	return err;
+}
+
+static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct dbgaufs_plink_arg *p;
+
+	p = file->private_data;
+	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
+}
+
+static const struct file_operations dbgaufs_plink_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_plink_open,
+	.release	= dbgaufs_plink_release,
+	.read		= dbgaufs_plink_read
+};
+
+/* ---------------------------------------------------------------------- */
+
+static int dbgaufs_xib_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0);
+	si_read_unlock(sb);
+	return err;
+}
+
+static const struct file_operations dbgaufs_xib_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xib_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+/* ---------------------------------------------------------------------- */
+
+#define DbgaufsXi_PREFIX "xi"
+
+static int dbgaufs_xino_open(struct inode *inode, struct file *file)
+{
+	int err;
+	long l;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct file *xf;
+	struct qstr *name;
+
+	err = -ENOENT;
+	xf = NULL;
+	name = &file->f_path.dentry->d_name;
+	if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX)
+		     || memcmp(name->name, DbgaufsXi_PREFIX,
+			       sizeof(DbgaufsXi_PREFIX) - 1)))
+		goto out;
+	err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l);
+	if (unlikely(err))
+		goto out;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	if (l <= au_sbend(sb)) {
+		xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file;
+		err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1);
+	} else
+		err = -ENOENT;
+	si_read_unlock(sb);
+
+out:
+	return err;
+}
+
+static const struct file_operations dbgaufs_xino_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xino_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
+{
+	aufs_bindex_t bend;
+	struct au_branch *br;
+	struct au_xino_file *xi;
+
+	if (!au_sbi(sb)->si_dbgaufs)
+		return;
+
+	bend = au_sbend(sb);
+	for (; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		xi = &br->br_xino;
+		debugfs_remove(xi->xi_dbgaufs);
+		xi->xi_dbgaufs = NULL;
+	}
+}
+
+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_sbinfo *sbinfo;
+	struct dentry *parent;
+	struct au_branch *br;
+	struct au_xino_file *xi;
+	aufs_bindex_t bend;
+	char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" + bindex + NUL */
+
+	sbinfo = au_sbi(sb);
+	parent = sbinfo->si_dbgaufs;
+	if (!parent)
+		return;
+
+	bend = au_sbend(sb);
+	for (; bindex <= bend; bindex++) {
+		snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex);
+		br = au_sbr(sb, bindex);
+		xi = &br->br_xino;
+		AuDebugOn(xi->xi_dbgaufs);
+		xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent,
+						     sbinfo, &dbgaufs_xino_fop);
+		/* ignore an error */
+		if (unlikely(!xi->xi_dbgaufs))
+			AuWarn1("failed %s under debugfs\n", name);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_EXPORT
+static int dbgaufs_xigen_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0);
+	si_read_unlock(sb);
+	return err;
+}
+
+static const struct file_operations dbgaufs_xigen_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xigen_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+
+	/*
+	 * This function is effectively a dynamic '__init' function,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	err = -EIO;
+	sbinfo->si_dbgaufs_xigen = debugfs_create_file
+		("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_xigen_fop);
+	if (sbinfo->si_dbgaufs_xigen)
+		err = 0;
+
+	return err;
+}
+#else
+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
+{
+	return 0;
+}
+#endif /* CONFIG_AUFS_EXPORT */
+
+/* ---------------------------------------------------------------------- */
+
+void dbgaufs_si_fin(struct au_sbinfo *sbinfo)
+{
+	/*
+	 * This function is effectively a dynamic '__fin' (cleanup) function,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	debugfs_remove_recursive(sbinfo->si_dbgaufs);
+	sbinfo->si_dbgaufs = NULL;
+	kobject_put(&sbinfo->si_kobj);
+}
+
+int dbgaufs_si_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+	char name[SysaufsSiNameLen];
+
+	/*
+	 * This function is effectively a dynamic '__init' function,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	err = -ENOENT;
+	if (!dbgaufs) {
+		AuErr1("/debug/aufs is uninitialized\n");
+		goto out;
+	}
+
+	err = -EIO;
+	sysaufs_name(sbinfo, name);
+	sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs);
+	if (unlikely(!sbinfo->si_dbgaufs))
+		goto out;
+	kobject_get(&sbinfo->si_kobj);
+
+	sbinfo->si_dbgaufs_xib = debugfs_create_file
+		("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_xib_fop);
+	if (unlikely(!sbinfo->si_dbgaufs_xib))
+		goto out_dir;
+
+	sbinfo->si_dbgaufs_plink = debugfs_create_file
+		("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_plink_fop);
+	if (unlikely(!sbinfo->si_dbgaufs_plink))
+		goto out_dir;
+
+	err = dbgaufs_xigen_init(sbinfo);
+	if (!err)
+		goto out; /* success */
+
+out_dir:
+	dbgaufs_si_fin(sbinfo);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void dbgaufs_fin(void)
+{
+	debugfs_remove(dbgaufs);
+}
+
+int __init dbgaufs_init(void)
+{
+	int err;
+
+	err = -EIO;
+	dbgaufs = debugfs_create_dir(AUFS_NAME, NULL);
+	if (dbgaufs)
+		err = 0;
+	return err;
+}
diff --git b/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h
new file mode 100644
index 0000000..81f272e
--- /dev/null
+++ b/fs/aufs/dbgaufs.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debugfs interface
+ */
+
+#ifndef __DBGAUFS_H__
+#define __DBGAUFS_H__
+
+#ifdef __KERNEL__
+
+struct super_block;
+struct au_sbinfo;
+
+#ifdef CONFIG_DEBUG_FS
+/* dbgaufs.c */
+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
+void dbgaufs_si_fin(struct au_sbinfo *sbinfo);
+int dbgaufs_si_init(struct au_sbinfo *sbinfo);
+void dbgaufs_fin(void);
+int __init dbgaufs_init(void);
+#else
+AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo)
+AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo)
+AuStubVoid(dbgaufs_fin, void)
+AuStubInt0(__init dbgaufs_init, void)
+#endif /* CONFIG_DEBUG_FS */
+
+#endif /* __KERNEL__ */
+#endif /* __DBGAUFS_H__ */
diff --git b/fs/aufs/dcsub.c b/fs/aufs/dcsub.c
new file mode 100644
index 0000000..e72acce
--- /dev/null
+++ b/fs/aufs/dcsub.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for dentry cache
+ */
+
+#include "aufs.h"
+
+static void au_dpage_free(struct au_dpage *dpage)
+{
+	int i;
+	struct dentry **p;
+
+	p = dpage->dentries;
+	for (i = 0; i < dpage->ndentry; i++)
+		dput(*p++);
+	free_page((unsigned long)dpage->dentries);
+}
+
+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp)
+{
+	int err;
+	void *p;
+
+	err = -ENOMEM;
+	dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp);
+	if (unlikely(!dpages->dpages))
+		goto out;
+
+	p = (void *)__get_free_page(gfp);
+	if (unlikely(!p))
+		goto out_dpages;
+
+	dpages->dpages[0].ndentry = 0;
+	dpages->dpages[0].dentries = p;
+	dpages->ndpage = 1;
+	return 0; /* success */
+
+out_dpages:
+	kfree(dpages->dpages);
+out:
+	return err;
+}
+
+void au_dpages_free(struct au_dcsub_pages *dpages)
+{
+	int i;
+	struct au_dpage *p;
+
+	p = dpages->dpages;
+	for (i = 0; i < dpages->ndpage; i++)
+		au_dpage_free(p++);
+	kfree(dpages->dpages);
+}
+
+static int au_dpages_append(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, gfp_t gfp)
+{
+	int err, sz;
+	struct au_dpage *dpage;
+	void *p;
+
+	dpage = dpages->dpages + dpages->ndpage - 1;
+	sz = PAGE_SIZE / sizeof(dentry);
+	if (unlikely(dpage->ndentry >= sz)) {
+		AuLabel(new dpage);
+		err = -ENOMEM;
+		sz = dpages->ndpage * sizeof(*dpages->dpages);
+		p = au_kzrealloc(dpages->dpages, sz,
+				 sz + sizeof(*dpages->dpages), gfp);
+		if (unlikely(!p))
+			goto out;
+
+		dpages->dpages = p;
+		dpage = dpages->dpages + dpages->ndpage;
+		p = (void *)__get_free_page(gfp);
+		if (unlikely(!p))
+			goto out;
+
+		dpage->ndentry = 0;
+		dpage->dentries = p;
+		dpages->ndpage++;
+	}
+
+	AuDebugOn(au_dcount(dentry) <= 0);
+	dpage->dentries[dpage->ndentry++] = dget_dlock(dentry);
+	return 0; /* success */
+
+out:
+	return err;
+}
+
+/* todo: BAD approach */
+/* copied from linux/fs/dcache.c */
+enum d_walk_ret {
+	D_WALK_CONTINUE,
+	D_WALK_QUIT,
+	D_WALK_NORETRY,
+	D_WALK_SKIP,
+};
+
+extern void d_walk(struct dentry *parent, void *data,
+		   enum d_walk_ret (*enter)(void *, struct dentry *),
+		   void (*finish)(void *));
+
+struct ac_dpages_arg {
+	int err;
+	struct au_dcsub_pages *dpages;
+	struct super_block *sb;
+	au_dpages_test test;
+	void *arg;
+};
+
+static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry)
+{
+	enum d_walk_ret ret;
+	struct ac_dpages_arg *arg = _arg;
+
+	ret = D_WALK_CONTINUE;
+	if (dentry->d_sb == arg->sb
+	    && !IS_ROOT(dentry)
+	    && au_dcount(dentry) > 0
+	    && au_di(dentry)
+	    && (!arg->test || arg->test(dentry, arg->arg))) {
+		arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC);
+		if (unlikely(arg->err))
+			ret = D_WALK_QUIT;
+	}
+
+	return ret;
+}
+
+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
+		   au_dpages_test test, void *arg)
+{
+	struct ac_dpages_arg args = {
+		.err	= 0,
+		.dpages	= dpages,
+		.sb	= root->d_sb,
+		.test	= test,
+		.arg	= arg
+	};
+
+	d_walk(root, &args, au_call_dpages_append, NULL);
+
+	return args.err;
+}
+
+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
+		       int do_include, au_dpages_test test, void *arg)
+{
+	int err;
+
+	err = 0;
+	write_seqlock(&rename_lock);
+	spin_lock(&dentry->d_lock);
+	if (do_include
+	    && au_dcount(dentry) > 0
+	    && (!test || test(dentry, arg)))
+		err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
+	spin_unlock(&dentry->d_lock);
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * RCU for vfsmount is unnecessary since this is a traversal within a
+	 * single mount
+	 */
+	while (!IS_ROOT(dentry)) {
+		dentry = dentry->d_parent; /* rename_lock is locked */
+		spin_lock(&dentry->d_lock);
+		if (au_dcount(dentry) > 0
+		    && (!test || test(dentry, arg)))
+			err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
+		spin_unlock(&dentry->d_lock);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	write_sequnlock(&rename_lock);
+	return err;
+}
+
+static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg)
+{
+	return au_di(dentry) && dentry->d_sb == arg;
+}
+
+int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, int do_include)
+{
+	return au_dcsub_pages_rev(dpages, dentry, do_include,
+				  au_dcsub_dpages_aufs, dentry->d_sb);
+}
+
+int au_test_subdir(struct dentry *d1, struct dentry *d2)
+{
+	struct path path[2] = {
+		{
+			.dentry = d1
+		},
+		{
+			.dentry = d2
+		}
+	};
+
+	return path_is_under(path + 0, path + 1);
+}
diff --git b/fs/aufs/dcsub.h b/fs/aufs/dcsub.h
new file mode 100644
index 0000000..5d2cf66
--- /dev/null
+++ b/fs/aufs/dcsub.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for dentry cache
+ */
+
+#ifndef __AUFS_DCSUB_H__
+#define __AUFS_DCSUB_H__
+
+#ifdef __KERNEL__
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+
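+/*
+ * A growable set of dentry references: each au_dpage owns one page holding
+ * PAGE_SIZE / sizeof(struct dentry *) pointers, and au_dcsub_pages is the
+ * array of those pages.  See au_dpages_append() in dcsub.c.
+ */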
+struct au_dpage {
+	int ndentry;
+	struct dentry **dentries;
+};
+
+struct au_dcsub_pages {
+	int ndpage;
+	struct au_dpage *dpages;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* dcsub.c */
+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp);
+void au_dpages_free(struct au_dcsub_pages *dpages);
+typedef int (*au_dpages_test)(struct dentry *dentry, void *arg);
+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
+		   au_dpages_test test, void *arg);
+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
+		       int do_include, au_dpages_test test, void *arg);
+int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, int do_include);
+int au_test_subdir(struct dentry *d1, struct dentry *d2);
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * todo: in linux-3.13, several similar (but faster) helpers were added to
+ * include/linux/dcache.h. Try them in the future.
+ */
+
+static inline int au_d_hashed_positive(struct dentry *d)
+{
+	int err;
+	struct inode *inode = d_inode(d);
+
+	err = 0;
+	if (unlikely(d_unhashed(d)
+		     || d_is_negative(d)
+		     || !inode->i_nlink))
+		err = -ENOENT;
+	return err;
+}
+
+static inline int au_d_linkable(struct dentry *d)
+{
+	int err;
+	struct inode *inode = d_inode(d);
+
+	err = au_d_hashed_positive(d);
+	if (err
+	    && d_is_positive(d)
+	    && (inode->i_state & I_LINKABLE))
+		err = 0;
+	return err;
+}
+
+static inline int au_d_alive(struct dentry *d)
+{
+	int err;
+	struct inode *inode;
+
+	err = 0;
+	if (!IS_ROOT(d))
+		err = au_d_hashed_positive(d);
+	else {
+		inode = d_inode(d);
+		if (unlikely(d_unlinked(d)
+			     || d_is_negative(d)
+			     || !inode->i_nlink))
+			err = -ENOENT;
+	}
+	return err;
+}
+
+static inline int au_alive_dir(struct dentry *d)
+{
+	int err;
+
+	err = au_d_alive(d);
+	if (unlikely(err || IS_DEADDIR(d_inode(d))))
+		err = -ENOENT;
+	return err;
+}
+
+static inline int au_qstreq(struct qstr *a, struct qstr *b)
+{
+	return a->len == b->len
+		&& !memcmp(a->name, b->name, a->len);
+}
+
+/*
+ * by the commit
+ * 360f547 2015-01-25 dcache: let the dentry count go down to zero without
+ *			taking d_lock
+ * the type of d_lockref.count became int, but the inlined function d_count()
+ * still returns unsigned int.
+ * I don't know why; perhaps it is kept that way for the existing d_count() users.
+ * Anyway au_dcount() lives on.
+ */
+static inline int au_dcount(struct dentry *d)
+{
+	return (int)d_count(d);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DCSUB_H__ */
diff --git b/fs/aufs/debug.c b/fs/aufs/debug.c
new file mode 100644
index 0000000..4529831
--- /dev/null
+++ b/fs/aufs/debug.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debug print functions
+ */
+
+#include "aufs.h"
+
+/* Returns 0, or -errno.  arg is in kp->arg. */
+static int param_atomic_t_set(const char *val, const struct kernel_param *kp)
+{
+	int err, n;
+
+	err = kstrtoint(val, 0, &n);
+	if (!err) {
+		if (n > 0)
+			au_debug_on();
+		else
+			au_debug_off();
+	}
+	return err;
+}
+
+/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
+static int param_atomic_t_get(char *buffer, const struct kernel_param *kp)
+{
+	atomic_t *a;
+
+	a = kp->arg;
+	return sprintf(buffer, "%d", atomic_read(a));
+}
+
+static struct kernel_param_ops param_ops_atomic_t = {
+	.set = param_atomic_t_set,
+	.get = param_atomic_t_get
+	/* void (*free)(void *arg) */
+};
+
+atomic_t aufs_debug = ATOMIC_INIT(0);
+MODULE_PARM_DESC(debug, "debug print");
+module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP);
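+/*
+ * With the permission bits above, the parameter should show up as
+ * /sys/module/aufs/parameters/debug; writing a positive integer bumps the
+ * counter and enables the AuDbg() output, zero or a negative value
+ * decrements it again (see param_atomic_t_set() above).
+ */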
+
+DEFINE_MUTEX(au_dbg_mtx);	/* just to serialize the dbg msgs */
+char *au_plevel = KERN_DEBUG;
+#define dpri(fmt, ...) do {					\
+	if ((au_plevel						\
+	     && strcmp(au_plevel, KERN_DEBUG))			\
+	    || au_debug_test())					\
+		printk("%s" fmt, au_plevel, ##__VA_ARGS__);	\
+} while (0)
+
+/* ---------------------------------------------------------------------- */
+
+void au_dpri_whlist(struct au_nhash *whlist)
+{
+	unsigned long ul, n;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (ul = 0; ul < n; ul++) {
+		hlist_for_each_entry(pos, head, wh_hash)
+			dpri("b%d, %.*s, %d\n",
+			     pos->wh_bindex,
+			     pos->wh_str.len, pos->wh_str.name,
+			     pos->wh_str.len);
+		head++;
+	}
+}
+
+void au_dpri_vdir(struct au_vdir *vdir)
+{
+	unsigned long ul;
+	union au_vdir_deblk_p p;
+	unsigned char *o;
+
+	if (!vdir || IS_ERR(vdir)) {
+		dpri("err %ld\n", PTR_ERR(vdir));
+		return;
+	}
+
+	dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n",
+	     vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk,
+	     vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version);
+	for (ul = 0; ul < vdir->vd_nblk; ul++) {
+		p.deblk = vdir->vd_deblk[ul];
+		o = p.deblk;
+		dpri("[%lu]: %p\n", ul, o);
+	}
+}
+
+static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn,
+			struct dentry *wh)
+{
+	char *n = NULL;
+	int l = 0;
+
+	if (!inode || IS_ERR(inode)) {
+		dpri("i%d: err %ld\n", bindex, PTR_ERR(inode));
+		return -1;
+	}
+
+	/* the type of i_blocks depends upon CONFIG_LBDAF */
+	BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long)
+		     && sizeof(inode->i_blocks) != sizeof(u64));
+	if (wh) {
+		n = (void *)wh->d_name.name;
+		l = wh->d_name.len;
+	}
+
+	dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu,"
+	     " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n",
+	     bindex, inode,
+	     inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??",
+	     atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode,
+	     i_size_read(inode), (unsigned long long)inode->i_blocks,
+	     hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff,
+	     inode->i_mapping ? inode->i_mapping->nrpages : 0,
+	     inode->i_state, inode->i_flags, inode->i_version,
+	     inode->i_generation,
+	     l ? ", wh " : "", l, n);
+	return 0;
+}
+
+void au_dpri_inode(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	aufs_bindex_t bindex;
+	int err, hn;
+
+	err = do_pri_inode(-1, inode, -1, NULL);
+	if (err || !au_test_aufs(inode->i_sb))
+		return;
+
+	iinfo = au_ii(inode);
+	if (!iinfo)
+		return;
+	dpri("i-1: bstart %d, bend %d, gen %d\n",
+	     iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode, NULL));
+	if (iinfo->ii_bstart < 0)
+		return;
+	hn = 0;
+	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) {
+		hn = !!au_hn(iinfo->ii_hinode + bindex);
+		do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn,
+			     iinfo->ii_hinode[0 + bindex].hi_whdentry);
+	}
+}
+
+void au_dpri_dalias(struct inode *inode)
+{
+	struct dentry *d;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias)
+		au_dpri_dentry(d);
+	spin_unlock(&inode->i_lock);
+}
+
+static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry)
+{
+	struct dentry *wh = NULL;
+	int hn;
+	struct au_iinfo *iinfo;
+
+	if (!dentry || IS_ERR(dentry)) {
+		dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry));
+		return -1;
+	}
+	/* do not call dget_parent() here */
+	/* note: access d_xxx without d_lock */
+	dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n",
+	     bindex, dentry, dentry,
+	     dentry->d_sb ? au_sbtype(dentry->d_sb) : "??",
+	     au_dcount(dentry), dentry->d_flags,
+	     d_unhashed(dentry) ? "un" : "");
+	hn = -1;
+	if (bindex >= 0
+	    && d_is_positive(dentry)
+	    && au_test_aufs(dentry->d_sb)) {
+		iinfo = au_ii(d_inode(dentry));
+		if (iinfo) {
+			hn = !!au_hn(iinfo->ii_hinode + bindex);
+			wh = iinfo->ii_hinode[0 + bindex].hi_whdentry;
+		}
+	}
+	do_pri_inode(bindex, d_inode(dentry), hn, wh);
+	return 0;
+}
+
+void au_dpri_dentry(struct dentry *dentry)
+{
+	struct au_dinfo *dinfo;
+	aufs_bindex_t bindex;
+	int err;
+	struct au_hdentry *hdp;
+
+	err = do_pri_dentry(-1, dentry);
+	if (err || !au_test_aufs(dentry->d_sb))
+		return;
+
+	dinfo = au_di(dentry);
+	if (!dinfo)
+		return;
+	dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d, tmp %d\n",
+	     dinfo->di_bstart, dinfo->di_bend,
+	     dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry),
+	     dinfo->di_tmpfile);
+	if (dinfo->di_bstart < 0)
+		return;
+	hdp = dinfo->di_hdentry;
+	for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++)
+		do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry);
+}
+
+static int do_pri_file(aufs_bindex_t bindex, struct file *file)
+{
+	char a[32];
+
+	if (!file || IS_ERR(file)) {
+		dpri("f%d: err %ld\n", bindex, PTR_ERR(file));
+		return -1;
+	}
+	a[0] = 0;
+	if (bindex < 0
+	    && !IS_ERR_OR_NULL(file->f_path.dentry)
+	    && au_test_aufs(file->f_path.dentry->d_sb)
+	    && au_fi(file))
+		snprintf(a, sizeof(a), ", gen %d, mmapped %d",
+			 au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
+	dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
+	     bindex, file->f_mode, file->f_flags, (long)file_count(file),
+	     file->f_version, file->f_pos, a);
+	if (!IS_ERR_OR_NULL(file->f_path.dentry))
+		do_pri_dentry(bindex, file->f_path.dentry);
+	return 0;
+}
+
+void au_dpri_file(struct file *file)
+{
+	struct au_finfo *finfo;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+	aufs_bindex_t bindex;
+	int err;
+
+	err = do_pri_file(-1, file);
+	if (err
+	    || IS_ERR_OR_NULL(file->f_path.dentry)
+	    || !au_test_aufs(file->f_path.dentry->d_sb))
+		return;
+
+	finfo = au_fi(file);
+	if (!finfo)
+		return;
+	if (finfo->fi_btop < 0)
+		return;
+	fidir = finfo->fi_hdir;
+	if (!fidir)
+		do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
+	else
+		for (bindex = finfo->fi_btop;
+		     bindex >= 0 && bindex <= fidir->fd_bbot;
+		     bindex++) {
+			hfile = fidir->fd_hfile + bindex;
+			do_pri_file(bindex, hfile ? hfile->hf_file : NULL);
+		}
+}
+
+static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br)
+{
+	struct vfsmount *mnt;
+	struct super_block *sb;
+
+	if (!br || IS_ERR(br))
+		goto out;
+	mnt = au_br_mnt(br);
+	if (!mnt || IS_ERR(mnt))
+		goto out;
+	sb = mnt->mnt_sb;
+	if (!sb || IS_ERR(sb))
+		goto out;
+
+	dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, "
+	     "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, "
+	     "xino %d\n",
+	     bindex, br->br_perm, br->br_id, atomic_read(&br->br_count),
+	     br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev),
+	     sb->s_flags, sb->s_count,
+	     atomic_read(&sb->s_active), !!br->br_xino.xi_file);
+	return 0;
+
+out:
+	dpri("s%d: err %ld\n", bindex, PTR_ERR(br));
+	return -1;
+}
+
+void au_dpri_sb(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	aufs_bindex_t bindex;
+	int err;
+	/* to reduce stack size */
+	struct {
+		struct vfsmount mnt;
+		struct au_branch fake;
+	} *a;
+
+	/* this function can be called from magic sysrq */
+	a = kzalloc(sizeof(*a), GFP_ATOMIC);
+	if (unlikely(!a)) {
+		dpri("no memory\n");
+		return;
+	}
+
+	a->mnt.mnt_sb = sb;
+	a->fake.br_path.mnt = &a->mnt;
+	atomic_set(&a->fake.br_count, 0);
+	smp_mb(); /* atomic_set */
+	err = do_pri_br(-1, &a->fake);
+	kfree(a);
+	dpri("dev 0x%x\n", sb->s_dev);
+	if (err || !au_test_aufs(sb))
+		return;
+
+	sbinfo = au_sbi(sb);
+	if (!sbinfo)
+		return;
+	dpri("nw %d, gen %u, kobj %d\n",
+	     atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation,
+	     atomic_read(&sbinfo->si_kobj.kref.refcount));
+	for (bindex = 0; bindex <= sbinfo->si_bend; bindex++)
+		do_pri_br(bindex, sbinfo->si_branch[0 + bindex]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line)
+{
+	struct inode *h_inode, *inode = d_inode(dentry);
+	struct dentry *h_dentry;
+	aufs_bindex_t bindex, bend, bi;
+
+	if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */)
+		return;
+
+	bend = au_dbend(dentry);
+	bi = au_ibend(inode);
+	if (bi < bend)
+		bend = bi;
+	bindex = au_dbstart(dentry);
+	bi = au_ibstart(inode);
+	if (bi > bindex)
+		bindex = bi;
+
+	for (; bindex <= bend; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		h_inode = au_h_iptr(inode, bindex);
+		if (unlikely(h_inode != d_inode(h_dentry))) {
+			au_debug_on();
+			AuDbg("b%d, %s:%d\n", bindex, func, line);
+			AuDbgDentry(dentry);
+			AuDbgInode(inode);
+			au_debug_off();
+			BUG();
+		}
+	}
+}
+
+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen)
+{
+	int err, i, j;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	AuDebugOn(err);
+	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1);
+	AuDebugOn(err);
+	for (i = dpages.ndpage - 1; !err && i >= 0; i--) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		for (j = dpage->ndentry - 1; !err && j >= 0; j--)
+			AuDebugOn(au_digen_test(dentries[j], sigen));
+	}
+	au_dpages_free(&dpages);
+}
+
+void au_dbg_verify_kthread(void)
+{
+	if (au_wkq_test()) {
+		au_dbg_blocked();
+		/*
+		 * It may be recursive, but udba=notify between two aufs mounts,
+		 * where a single ro branch is shared, is not a problem.
+		 */
+		/* WARN_ON(1); */
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
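+/*
+ * Two small sanity checks: aufs_bindex_t must be a signed type, and the
+ * len field of au_vdir_destr, assigned -1 here, must wrap to a value no
+ * smaller than NAME_MAX, i.e. it is wide enough for any name length.
+ */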
+int __init au_debug_init(void)
+{
+	aufs_bindex_t bindex;
+	struct au_vdir_destr destr;
+
+	bindex = -1;
+	AuDebugOn(bindex >= 0);
+
+	destr.len = -1;
+	AuDebugOn(destr.len < NAME_MAX);
+
+#ifdef CONFIG_4KSTACKS
+	pr_warn("CONFIG_4KSTACKS is defined.\n");
+#endif
+
+	return 0;
+}
diff --git b/fs/aufs/debug.h b/fs/aufs/debug.h
new file mode 100644
index 0000000..0567f31
--- /dev/null
+++ b/fs/aufs/debug.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debug print functions
+ */
+
+#ifndef __AUFS_DEBUG_H__
+#define __AUFS_DEBUG_H__
+
+#ifdef __KERNEL__
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/sysrq.h>
+
+#ifdef CONFIG_AUFS_DEBUG
+#define AuDebugOn(a)		BUG_ON(a)
+
+/* module parameter */
+extern atomic_t aufs_debug;
+static inline void au_debug_on(void)
+{
+	atomic_inc(&aufs_debug);
+}
+static inline void au_debug_off(void)
+{
+	atomic_dec_if_positive(&aufs_debug);
+}
+
+static inline int au_debug_test(void)
+{
+	return atomic_read(&aufs_debug) > 0;
+}
+#else
+#define AuDebugOn(a)		do {} while (0)
+AuStubVoid(au_debug_on, void)
+AuStubVoid(au_debug_off, void)
+AuStubInt0(au_debug_test, void)
+#endif /* CONFIG_AUFS_DEBUG */
+
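+/*
+ * Paired with param_ops_atomic_t in debug.c; the kernel's moduleparam
+ * macros resolve param_ops_<type> and param_check_<type> by name, so this
+ * define is what lets module_param_named(..., atomic_t, ...) compile.
+ */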
+#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t)
+
+/* ---------------------------------------------------------------------- */
+
+/* debug print */
+
+#define AuDbg(fmt, ...) do { \
+	if (au_debug_test()) \
+		pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \
+} while (0)
+#define AuLabel(l)		AuDbg(#l "\n")
+#define AuIOErr(fmt, ...)	pr_err("I/O Error, " fmt, ##__VA_ARGS__)
+#define AuWarn1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuErr1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		pr_err(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuIOErr1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		AuIOErr(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuUnsupportMsg	"This operation is not supported." \
+			" Please report this application to aufs-users ML."
+#define AuUnsupport(fmt, ...) do { \
+	pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \
+	dump_stack(); \
+} while (0)
+
+#define AuTraceErr(e) do { \
+	if (unlikely((e) < 0)) \
+		AuDbg("err %d\n", (int)(e)); \
+} while (0)
+
+#define AuTraceErrPtr(p) do { \
+	if (IS_ERR(p)) \
+		AuDbg("err %ld\n", PTR_ERR(p)); \
+} while (0)
+
+/* dirty macros for debug print, use with "%.*s" and caution */
+#define AuLNPair(qstr)		(qstr)->len, (qstr)->name
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry;
+#ifdef CONFIG_AUFS_DEBUG
+extern struct mutex au_dbg_mtx;
+extern char *au_plevel;
+struct au_nhash;
+void au_dpri_whlist(struct au_nhash *whlist);
+struct au_vdir;
+void au_dpri_vdir(struct au_vdir *vdir);
+struct inode;
+void au_dpri_inode(struct inode *inode);
+void au_dpri_dalias(struct inode *inode);
+void au_dpri_dentry(struct dentry *dentry);
+struct file;
+void au_dpri_file(struct file *filp);
+struct super_block;
+void au_dpri_sb(struct super_block *sb);
+
+#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__)
+void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line);
+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen);
+void au_dbg_verify_kthread(void);
+
+int __init au_debug_init(void);
+
+#define AuDbgWhlist(w) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#w "\n"); \
+	au_dpri_whlist(w); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgVdir(v) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#v "\n"); \
+	au_dpri_vdir(v); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgInode(i) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#i "\n"); \
+	au_dpri_inode(i); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgDAlias(i) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#i "\n"); \
+	au_dpri_dalias(i); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgDentry(d) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#d "\n"); \
+	au_dpri_dentry(d); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgFile(f) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#f "\n"); \
+	au_dpri_file(f); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgSb(sb) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#sb "\n"); \
+	au_dpri_sb(sb); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgSym(addr) do {				\
+	char sym[KSYM_SYMBOL_LEN];			\
+	sprint_symbol(sym, (unsigned long)addr);	\
+	AuDbg("%s\n", sym);				\
+} while (0)
+#else
+AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry)
+AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen)
+AuStubVoid(au_dbg_verify_kthread, void)
+AuStubInt0(__init au_debug_init, void)
+
+#define AuDbgWhlist(w)		do {} while (0)
+#define AuDbgVdir(v)		do {} while (0)
+#define AuDbgInode(i)		do {} while (0)
+#define AuDbgDAlias(i)		do {} while (0)
+#define AuDbgDentry(d)		do {} while (0)
+#define AuDbgFile(f)		do {} while (0)
+#define AuDbgSb(sb)		do {} while (0)
+#define AuDbgSym(addr)		do {} while (0)
+#endif /* CONFIG_AUFS_DEBUG */
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_MAGIC_SYSRQ
+int __init au_sysrq_init(void);
+void au_sysrq_fin(void);
+
+#ifdef CONFIG_HW_CONSOLE
+#define au_dbg_blocked() do { \
+	WARN_ON(1); \
+	handle_sysrq('w'); \
+} while (0)
+#else
+AuStubVoid(au_dbg_blocked, void)
+#endif
+
+#else
+AuStubInt0(__init au_sysrq_init, void)
+AuStubVoid(au_sysrq_fin, void)
+AuStubVoid(au_dbg_blocked, void)
+#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DEBUG_H__ */
diff --git b/fs/aufs/dentry.c b/fs/aufs/dentry.c
new file mode 100644
index 0000000..e65fadd
--- /dev/null
+++ b/fs/aufs/dentry.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * lookup and dentry operations
+ */
+
+#include <linux/namei.h>
+#include "aufs.h"
+
+#define AuLkup_ALLOW_NEG	1
+#define AuLkup_IGNORE_PERM	(1 << 1)
+#define au_ftest_lkup(flags, name)	((flags) & AuLkup_##name)
+#define au_fset_lkup(flags, name) \
+	do { (flags) |= AuLkup_##name; } while (0)
+#define au_fclr_lkup(flags, name) \
+	do { (flags) &= ~AuLkup_##name; } while (0)
+
+struct au_do_lookup_args {
+	unsigned int		flags;
+	mode_t			type;
+};
+
+/*
+ * returns a positive or negative dentry, NULL, or an error.
+ * NULL means the entry is whiteout-ed or not found.
+ */
+static struct dentry*
+au_do_lookup(struct dentry *h_parent, struct dentry *dentry,
+	     aufs_bindex_t bindex, struct qstr *wh_name,
+	     struct au_do_lookup_args *args)
+{
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct au_branch *br;
+	int wh_found, opq;
+	unsigned char wh_able;
+	const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG);
+	const unsigned char ignore_perm = !!au_ftest_lkup(args->flags,
+							  IGNORE_PERM);
+
+	wh_found = 0;
+	br = au_sbr(dentry->d_sb, bindex);
+	wh_able = !!au_br_whable(br->br_perm);
+	if (wh_able)
+		wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0);
+	h_dentry = ERR_PTR(wh_found);
+	if (!wh_found)
+		goto real_lookup;
+	if (unlikely(wh_found < 0))
+		goto out;
+
+	/* We found a whiteout */
+	/* au_set_dbend(dentry, bindex); */
+	au_set_dbwh(dentry, bindex);
+	if (!allow_neg)
+		return NULL; /* success */
+
+real_lookup:
+	if (!ignore_perm)
+		h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
+	else
+		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
+	if (IS_ERR(h_dentry)) {
+		if (PTR_ERR(h_dentry) == -ENAMETOOLONG
+		    && !allow_neg)
+			h_dentry = NULL;
+		goto out;
+	}
+
+	h_inode = d_inode(h_dentry);
+	if (d_is_negative(h_dentry)) {
+		if (!allow_neg)
+			goto out_neg;
+	} else if (wh_found
+		   || (args->type && args->type != (h_inode->i_mode & S_IFMT)))
+		goto out_neg;
+
+	if (au_dbend(dentry) <= bindex)
+		au_set_dbend(dentry, bindex);
+	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
+		au_set_dbstart(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, h_dentry);
+
+	if (!d_is_dir(h_dentry)
+	    || !wh_able
+	    || (d_really_is_positive(dentry) && !d_is_dir(dentry)))
+		goto out; /* success */
+
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	opq = au_diropq_test(h_dentry);
+	inode_unlock(h_inode);
+	if (opq > 0)
+		au_set_dbdiropq(dentry, bindex);
+	else if (unlikely(opq < 0)) {
+		au_set_h_dptr(dentry, bindex, NULL);
+		h_dentry = ERR_PTR(opq);
+	}
+	goto out;
+
+out_neg:
+	dput(h_dentry);
+	h_dentry = NULL;
+out:
+	return h_dentry;
+}
+
+static int au_test_shwh(struct super_block *sb, const struct qstr *name)
+{
+	if (unlikely(!au_opt_test(au_mntflags(sb), SHWH)
+		     && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)))
+		return -EPERM;
+	return 0;
+}
+
+/*
+ * returns the number of lower positive dentries,
+ * otherwise an error.
+ * may be called at unlink time with @type set to zero.
+ */
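+/*
+ * The branch loop below runs from @bstart to the parent's tail index;
+ * au_do_lookup() records each lower dentry it finds, and the walk stops
+ * early at a whiteout, at a dir-opaque mark, or, for non-directories,
+ * after the first positive entry.
+ */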
+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type)
+{
+	int npositive, err;
+	aufs_bindex_t bindex, btail, bdiropq;
+	unsigned char isdir, dirperm1;
+	struct qstr whname;
+	struct au_do_lookup_args args = {
+		.flags		= 0,
+		.type		= type
+	};
+	const struct qstr *name = &dentry->d_name;
+	struct dentry *parent;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = au_test_shwh(sb, name);
+	if (unlikely(err))
+		goto out;
+
+	err = au_wh_name_alloc(&whname, name);
+	if (unlikely(err))
+		goto out;
+
+	isdir = !!d_is_dir(dentry);
+	if (!type)
+		au_fset_lkup(args.flags, ALLOW_NEG);
+	dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1);
+
+	npositive = 0;
+	parent = dget_parent(dentry);
+	btail = au_dbtaildir(parent);
+	for (bindex = bstart; bindex <= btail; bindex++) {
+		struct dentry *h_parent, *h_dentry;
+		struct inode *h_inode, *h_dir;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry) {
+			if (d_is_positive(h_dentry))
+				npositive++;
+			if (type != S_IFDIR)
+				break;
+			continue;
+		}
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || !d_is_dir(h_parent))
+			continue;
+
+		h_dir = d_inode(h_parent);
+		inode_lock_nested(h_dir, AuLsc_I_PARENT);
+		h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname,
+					&args);
+		inode_unlock(h_dir);
+		err = PTR_ERR(h_dentry);
+		if (IS_ERR(h_dentry))
+			goto out_parent;
+		if (h_dentry)
+			au_fclr_lkup(args.flags, ALLOW_NEG);
+		if (dirperm1)
+			au_fset_lkup(args.flags, IGNORE_PERM);
+
+		if (au_dbwh(dentry) == bindex)
+			break;
+		if (!h_dentry)
+			continue;
+		if (d_is_negative(h_dentry))
+			continue;
+		h_inode = d_inode(h_dentry);
+		npositive++;
+		if (!args.type)
+			args.type = h_inode->i_mode & S_IFMT;
+		if (args.type != S_IFDIR)
+			break;
+		else if (isdir) {
+			/* the type of lower may be different */
+			bdiropq = au_dbdiropq(dentry);
+			if (bdiropq >= 0 && bdiropq <= bindex)
+				break;
+		}
+	}
+
+	if (npositive) {
+		AuLabel(positive);
+		au_update_dbstart(dentry);
+	}
+	err = npositive;
+	if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE)
+		     && au_dbstart(dentry) < 0)) {
+		err = -EIO;
+		AuIOErr("both of real entry and whiteout found, %pd, err %d\n",
+			dentry, err);
+	}
+
+out_parent:
+	dput(parent);
+	kfree(whname.name);
+out:
+	return err;
+}
+
+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+	int wkq_err;
+
+	if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC))
+		dentry = vfsub_lkup_one(name, parent);
+	else {
+		struct vfsub_lkup_one_args args = {
+			.errp	= &dentry,
+			.name	= name,
+			.parent	= parent
+		};
+
+		wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args);
+		if (unlikely(wkq_err))
+			dentry = ERR_PTR(wkq_err);
+	}
+
+	return dentry;
+}
+
+/*
+ * look up @dentry on @bindex; the lower entry there is expected to be negative.
+ */
+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh)
+{
+	int err;
+	struct dentry *parent, *h_parent, *h_dentry;
+	struct au_branch *br;
+
+	parent = dget_parent(dentry);
+	h_parent = au_h_dptr(parent, bindex);
+	br = au_sbr(dentry->d_sb, bindex);
+	if (wh)
+		h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
+	else
+		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
+	err = PTR_ERR(h_dentry);
+	if (IS_ERR(h_dentry))
+		goto out;
+	if (unlikely(d_is_positive(h_dentry))) {
+		err = -EIO;
+		AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex);
+		dput(h_dentry);
+		goto out;
+	}
+
+	err = 0;
+	if (bindex < au_dbstart(dentry))
+		au_set_dbstart(dentry, bindex);
+	if (au_dbend(dentry) < bindex)
+		au_set_dbend(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, h_dentry);
+
+out:
+	dput(parent);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* subset of struct inode */
+struct au_iattr {
+	unsigned long		i_ino;
+	/* unsigned int		i_nlink; */
+	kuid_t			i_uid;
+	kgid_t			i_gid;
+	u64			i_version;
+/*
+	loff_t			i_size;
+	blkcnt_t		i_blocks;
+*/
+	umode_t			i_mode;
+};
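+
+/*
+ * au_iattr_save() below takes a snapshot of a lower inode before the entry
+ * is looked up again, and au_h_verify_dentry() compares that snapshot with
+ * the current lower inode to detect changes made behind aufs' back.
+ */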
+
+static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode)
+{
+	ia->i_ino = h_inode->i_ino;
+	/* ia->i_nlink = h_inode->i_nlink; */
+	ia->i_uid = h_inode->i_uid;
+	ia->i_gid = h_inode->i_gid;
+	ia->i_version = h_inode->i_version;
+/*
+	ia->i_size = h_inode->i_size;
+	ia->i_blocks = h_inode->i_blocks;
+*/
+	ia->i_mode = (h_inode->i_mode & S_IFMT);
+}
+
+static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode)
+{
+	return ia->i_ino != h_inode->i_ino
+		/* || ia->i_nlink != h_inode->i_nlink */
+		|| !uid_eq(ia->i_uid, h_inode->i_uid)
+		|| !gid_eq(ia->i_gid, h_inode->i_gid)
+		|| ia->i_version != h_inode->i_version
+/*
+		|| ia->i_size != h_inode->i_size
+		|| ia->i_blocks != h_inode->i_blocks
+*/
+		|| ia->i_mode != (h_inode->i_mode & S_IFMT);
+}
+
+static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent,
+			      struct au_branch *br)
+{
+	int err;
+	struct au_iattr ia;
+	struct inode *h_inode;
+	struct dentry *h_d;
+	struct super_block *h_sb;
+
+	err = 0;
+	memset(&ia, -1, sizeof(ia));
+	h_sb = h_dentry->d_sb;
+	h_inode = NULL;
+	if (d_is_positive(h_dentry)) {
+		h_inode = d_inode(h_dentry);
+		au_iattr_save(&ia, h_inode);
+	} else if (au_test_nfs(h_sb) || au_test_fuse(h_sb))
+		/* nfs d_revalidate may return 0 for a negative dentry */
+		/* fuse d_revalidate always returns 0 for a negative dentry */
+		goto out;
+
+	/* main purpose is namei.c:cached_lookup() and d_revalidate */
+	h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent);
+	err = PTR_ERR(h_d);
+	if (IS_ERR(h_d))
+		goto out;
+
+	err = 0;
+	if (unlikely(h_d != h_dentry
+		     || d_inode(h_d) != h_inode
+		     || (h_inode && au_iattr_test(&ia, h_inode))))
+		err = au_busy_or_stale();
+	dput(h_d);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
+		struct dentry *h_parent, struct au_branch *br)
+{
+	int err;
+
+	err = 0;
+	if (udba == AuOpt_UDBA_REVAL
+	    && !au_test_fs_remote(h_dentry->d_sb)) {
+		IMustLock(h_dir);
+		err = (d_inode(h_dentry->d_parent) != h_dir);
+	} else if (udba != AuOpt_UDBA_NONE)
+		err = au_h_verify_dentry(h_dentry, h_parent, br);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
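+/*
+ * After the branch array has changed, a cached lower dentry may sit at a
+ * stale index; move every hd_dentry to the index matching its branch id,
+ * drop the ones whose branch is gone, and recompute the dinfo's
+ * bstart/bend (and bwh/bdiropq).
+ */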
+static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent)
+{
+	int err;
+	aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq;
+	struct au_hdentry tmp, *p, *q;
+	struct au_dinfo *dinfo;
+	struct super_block *sb;
+
+	DiMustWriteLock(dentry);
+
+	sb = dentry->d_sb;
+	dinfo = au_di(dentry);
+	bend = dinfo->di_bend;
+	bwh = dinfo->di_bwh;
+	bdiropq = dinfo->di_bdiropq;
+	p = dinfo->di_hdentry + dinfo->di_bstart;
+	for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) {
+		if (!p->hd_dentry)
+			continue;
+
+		new_bindex = au_br_index(sb, p->hd_id);
+		if (new_bindex == bindex)
+			continue;
+
+		if (dinfo->di_bwh == bindex)
+			bwh = new_bindex;
+		if (dinfo->di_bdiropq == bindex)
+			bdiropq = new_bindex;
+		if (new_bindex < 0) {
+			au_hdput(p);
+			p->hd_dentry = NULL;
+			continue;
+		}
+
+		/* swap two lower dentries, and loop again */
+		q = dinfo->di_hdentry + new_bindex;
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hd_dentry) {
+			bindex--;
+			p--;
+		}
+	}
+
+	dinfo->di_bwh = -1;
+	if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh))
+		dinfo->di_bwh = bwh;
+
+	dinfo->di_bdiropq = -1;
+	if (bdiropq >= 0
+	    && bdiropq <= au_sbend(sb)
+	    && au_sbr_whable(sb, bdiropq))
+		dinfo->di_bdiropq = bdiropq;
+
+	err = -EIO;
+	dinfo->di_bstart = -1;
+	dinfo->di_bend = -1;
+	bend = au_dbend(parent);
+	p = dinfo->di_hdentry;
+	for (bindex = 0; bindex <= bend; bindex++, p++)
+		if (p->hd_dentry) {
+			dinfo->di_bstart = bindex;
+			break;
+		}
+
+	if (dinfo->di_bstart >= 0) {
+		p = dinfo->di_hdentry + bend;
+		for (bindex = bend; bindex >= 0; bindex--, p--)
+			if (p->hd_dentry) {
+				dinfo->di_bend = bindex;
+				err = 0;
+				break;
+			}
+	}
+
+	return err;
+}
+
+static void au_do_hide(struct dentry *dentry)
+{
+	struct inode *inode;
+
+	if (d_really_is_positive(dentry)) {
+		inode = d_inode(dentry);
+		if (!d_is_dir(dentry)) {
+			if (inode->i_nlink && !d_unhashed(dentry))
+				drop_nlink(inode);
+		} else {
+			clear_nlink(inode);
+			/* stop next lookup */
+			inode->i_flags |= S_DEAD;
+		}
+		smp_mb(); /* necessary? */
+	}
+	d_drop(dentry);
+}
+
+static int au_hide_children(struct dentry *parent)
+{
+	int err, i, j, ndentry;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry *dentry;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, parent, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	/* in reverse order */
+	for (i = dpages.ndpage - 1; i >= 0; i--) {
+		dpage = dpages.dpages + i;
+		ndentry = dpage->ndentry;
+		for (j = ndentry - 1; j >= 0; j--) {
+			dentry = dpage->dentries[j];
+			if (dentry != parent)
+				au_do_hide(dentry);
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static void au_hide(struct dentry *dentry)
+{
+	int err;
+
+	AuDbgDentry(dentry);
+	if (d_is_dir(dentry)) {
+		/* shrink_dcache_parent(dentry); */
+		err = au_hide_children(dentry);
+		if (unlikely(err))
+			AuIOErr("%pd, failed hiding children, ignored %d\n",
+				dentry, err);
+	}
+	au_do_hide(dentry);
+}
+
+/*
+ * By adding a dirty branch, a cached dentry may be affected in various ways.
+ *
+ * a dirty branch is added
+ * - on the top of layers
+ * - in the middle of layers
+ * - to the bottom of layers
+ *
+ * on the added branch there exists
+ * - a whiteout
+ * - a diropq
+ * - a same named entry
+ *   + exist
+ *     * negative --> positive
+ *     * positive --> positive
+ *	 - type is unchanged
+ *	 - type is changed
+ *   + doesn't exist
+ *     * negative --> negative
+ *     * positive --> negative (rejected by au_br_del() for non-dir case)
+ * - none
+ */
+static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo,
+			       struct au_dinfo *tmp)
+{
+	int err;
+	aufs_bindex_t bindex, bend;
+	struct {
+		struct dentry *dentry;
+		struct inode *inode;
+		mode_t mode;
+	} orig_h, tmp_h = {
+		.dentry = NULL
+	};
+	struct au_hdentry *hd;
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry;
+
+	err = 0;
+	AuDebugOn(dinfo->di_bstart < 0);
+	orig_h.mode = 0;
+	orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry;
+	orig_h.inode = NULL;
+	if (d_is_positive(orig_h.dentry)) {
+		orig_h.inode = d_inode(orig_h.dentry);
+		orig_h.mode = orig_h.inode->i_mode & S_IFMT;
+	}
+	if (tmp->di_bstart >= 0) {
+		tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry;
+		if (d_is_positive(tmp_h.dentry)) {
+			tmp_h.inode = d_inode(tmp_h.dentry);
+			tmp_h.mode = tmp_h.inode->i_mode & S_IFMT;
+		}
+	}
+
+	inode = NULL;
+	if (d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (!orig_h.inode) {
+		AuDbg("nagative originally\n");
+		if (inode) {
+			au_hide(dentry);
+			goto out;
+		}
+		AuDebugOn(inode);
+		AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
+		AuDebugOn(dinfo->di_bdiropq != -1);
+
+		if (!tmp_h.inode) {
+			AuDbg("negative --> negative\n");
+			/* should have only one negative lower */
+			if (tmp->di_bstart >= 0
+			    && tmp->di_bstart < dinfo->di_bstart) {
+				AuDebugOn(tmp->di_bstart != tmp->di_bend);
+				AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
+				au_set_h_dptr(dentry, dinfo->di_bstart, NULL);
+				au_di_cp(dinfo, tmp);
+				hd = tmp->di_hdentry + tmp->di_bstart;
+				au_set_h_dptr(dentry, tmp->di_bstart,
+					      dget(hd->hd_dentry));
+			}
+			au_dbg_verify_dinode(dentry);
+		} else {
+			AuDbg("negative --> positive\n");
+			/*
+			 * similar to the behaviour of a create which bypassed
+			 * aufs.
+			 * unhash it in order to force an error in the
+			 * succeeding create operation.
+			 * we should not set S_DEAD here.
+			 */
+			d_drop(dentry);
+			/* au_di_swap(tmp, dinfo); */
+			au_dbg_verify_dinode(dentry);
+		}
+	} else {
+		AuDbg("positive originally\n");
+		/* inode may be NULL */
+		AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode);
+		if (!tmp_h.inode) {
+			AuDbg("positive --> negative\n");
+			/* or bypassing aufs */
+			au_hide(dentry);
+			if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart)
+				dinfo->di_bwh = tmp->di_bwh;
+			if (inode)
+				err = au_refresh_hinode_self(inode);
+			au_dbg_verify_dinode(dentry);
+		} else if (orig_h.mode == tmp_h.mode) {
+			AuDbg("positive --> positive, same type\n");
+			if (!S_ISDIR(orig_h.mode)
+			    && dinfo->di_bstart > tmp->di_bstart) {
+				/*
+				 * similar to the behaviour of removing and
+				 * creating.
+				 */
+				au_hide(dentry);
+				if (inode)
+					err = au_refresh_hinode_self(inode);
+				au_dbg_verify_dinode(dentry);
+			} else {
+				/* fill empty slots */
+				if (dinfo->di_bstart > tmp->di_bstart)
+					dinfo->di_bstart = tmp->di_bstart;
+				if (dinfo->di_bend < tmp->di_bend)
+					dinfo->di_bend = tmp->di_bend;
+				dinfo->di_bwh = tmp->di_bwh;
+				dinfo->di_bdiropq = tmp->di_bdiropq;
+				hd = tmp->di_hdentry;
+				bend = dinfo->di_bend;
+				for (bindex = tmp->di_bstart; bindex <= bend;
+				     bindex++) {
+					if (au_h_dptr(dentry, bindex))
+						continue;
+					h_dentry = hd[bindex].hd_dentry;
+					if (!h_dentry)
+						continue;
+					AuDebugOn(d_is_negative(h_dentry));
+					h_inode = d_inode(h_dentry);
+					AuDebugOn(orig_h.mode
+						  != (h_inode->i_mode
+						      & S_IFMT));
+					au_set_h_dptr(dentry, bindex,
+						      dget(h_dentry));
+				}
+				err = au_refresh_hinode(inode, dentry);
+				au_dbg_verify_dinode(dentry);
+			}
+		} else {
+			AuDbg("positive --> positive, different type\n");
+			/* similar to the behaviour of removing and creating */
+			au_hide(dentry);
+			if (inode)
+				err = au_refresh_hinode_self(inode);
+			au_dbg_verify_dinode(dentry);
+		}
+	}
+
+out:
+	return err;
+}
+
+void au_refresh_dop(struct dentry *dentry, int force_reval)
+{
+	const struct dentry_operations *dop
+		= force_reval ? &aufs_dop : dentry->d_sb->s_d_op;
+	static const unsigned int mask
+		= DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE;
+
+	BUILD_BUG_ON(sizeof(mask) != sizeof(dentry->d_flags));
+
+	if (dentry->d_op == dop)
+		return;
+
+	AuDbg("%pd\n", dentry);
+	spin_lock(&dentry->d_lock);
+	if (dop == &aufs_dop)
+		dentry->d_flags |= mask;
+	else
+		dentry->d_flags &= ~mask;
+	dentry->d_op = dop;
+	spin_unlock(&dentry->d_lock);
+}
+
+int au_refresh_dentry(struct dentry *dentry, struct dentry *parent)
+{
+	int err, ebrange;
+	unsigned int sigen;
+	struct au_dinfo *dinfo, *tmp;
+	struct super_block *sb;
+	struct inode *inode;
+
+	DiMustWriteLock(dentry);
+	AuDebugOn(IS_ROOT(dentry));
+	AuDebugOn(d_really_is_negative(parent));
+
+	sb = dentry->d_sb;
+	sigen = au_sigen(sb);
+	err = au_digen_test(parent, sigen);
+	if (unlikely(err))
+		goto out;
+
+	dinfo = au_di(dentry);
+	err = au_di_realloc(dinfo, au_sbend(sb) + 1);
+	if (unlikely(err))
+		goto out;
+	ebrange = au_dbrange_test(dentry);
+	if (!ebrange)
+		ebrange = au_do_refresh_hdentry(dentry, parent);
+
+	if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) {
+		AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0);
+		if (d_really_is_positive(dentry)) {
+			inode = d_inode(dentry);
+			err = au_refresh_hinode_self(inode);
+		}
+		au_dbg_verify_dinode(dentry);
+		if (!err)
+			goto out_dgen; /* success */
+		goto out;
+	}
+
+	/* temporary dinfo */
+	AuDbgDentry(dentry);
+	err = -ENOMEM;
+	tmp = au_di_alloc(sb, AuLsc_DI_TMP);
+	if (unlikely(!tmp))
+		goto out;
+	au_di_swap(tmp, dinfo);
+	/* returns the number of positive dentries */
+	/*
+	 * if the current working dir has been removed, it returns an error,
+	 * but the dentry itself is still valid.
+	 */
+	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
+	AuDbgDentry(dentry);
+	au_di_swap(tmp, dinfo);
+	if (err == -ENOENT)
+		err = 0;
+	if (err >= 0) {
+		/* compare/refresh by dinfo */
+		AuDbgDentry(dentry);
+		err = au_refresh_by_dinfo(dentry, dinfo, tmp);
+		au_dbg_verify_dinode(dentry);
+		AuTraceErr(err);
+	}
+	au_rw_write_unlock(&tmp->di_rwsem);
+	au_di_free(tmp);
+	if (unlikely(err))
+		goto out;
+
+out_dgen:
+	au_update_digen(dentry);
+out:
+	if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) {
+		AuIOErr("failed refreshing %pd, %d\n", dentry, err);
+		AuDbgDentry(dentry);
+	}
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags,
+			   struct dentry *dentry, aufs_bindex_t bindex)
+{
+	int err, valid;
+
+	err = 0;
+	if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE))
+		goto out;
+
+	AuDbg("b%d\n", bindex);
+	/*
+	 * support for LOOKUP_CREATE/OPEN on the lower fs was given up,
+	 * due to whiteouts and branch permissions.
+	 */
+	flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE
+		   | LOOKUP_FOLLOW | LOOKUP_EXCL);
+	/* it may return tri-state */
+	valid = h_dentry->d_op->d_revalidate(h_dentry, flags);
+
+	if (unlikely(valid < 0))
+		err = valid;
+	else if (!valid)
+		err = -EINVAL;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* todo: remove this */
+static int h_d_revalidate(struct dentry *dentry, struct inode *inode,
+			  unsigned int flags, int do_udba)
+{
+	int err;
+	umode_t mode, h_mode;
+	aufs_bindex_t bindex, btail, bstart, ibs, ibe;
+	unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile;
+	struct inode *h_inode, *h_cached_inode;
+	struct dentry *h_dentry;
+	struct qstr *name, *h_name;
+
+	err = 0;
+	plus = 0;
+	mode = 0;
+	ibs = -1;
+	ibe = -1;
+	unhashed = !!d_unhashed(dentry);
+	is_root = !!IS_ROOT(dentry);
+	name = &dentry->d_name;
+	tmpfile = au_di(dentry)->di_tmpfile;
+
+	/*
+	 * Theoretically, the REVAL test should be unnecessary when
+	 * {FS,I}NOTIFY is in use.
+	 * But {fs,i}notify does not fire some necessary events
+	 * (e.g. IN_ATTRIB for atime/nlink/pageio),
+	 * so do the REVAL test too.
+	 */
+	if (do_udba && inode) {
+		mode = (inode->i_mode & S_IFMT);
+		plus = (inode->i_nlink > 0);
+		ibs = au_ibstart(inode);
+		ibe = au_ibend(inode);
+	}
+
+	bstart = au_dbstart(dentry);
+	btail = bstart;
+	if (inode && S_ISDIR(inode->i_mode))
+		btail = au_dbtaildir(dentry);
+	for (bindex = bstart; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+
+		AuDbg("b%d, %pd\n", bindex, h_dentry);
+		h_nfs = !!au_test_nfs(h_dentry->d_sb);
+		spin_lock(&h_dentry->d_lock);
+		h_name = &h_dentry->d_name;
+		if (unlikely(do_udba
+			     && !is_root
+			     && ((!h_nfs
+				  && (unhashed != !!d_unhashed(h_dentry)
+				      || (!tmpfile
+					  && !au_qstreq(name, h_name))
+					  ))
+				 || (h_nfs
+				     && !(flags & LOOKUP_OPEN)
+				     && (h_dentry->d_flags
+					 & DCACHE_NFSFS_RENAMED)))
+			    )) {
+			int h_unhashed;
+
+			h_unhashed = d_unhashed(h_dentry);
+			spin_unlock(&h_dentry->d_lock);
+			AuDbg("unhash 0x%x 0x%x, %pd %pd\n",
+			      unhashed, h_unhashed, dentry, h_dentry);
+			goto err;
+		}
+		spin_unlock(&h_dentry->d_lock);
+
+		err = au_do_h_d_reval(h_dentry, flags, dentry, bindex);
+		if (unlikely(err))
+			/* do not goto err, to keep the errno */
+			break;
+
+		/* todo: plink too? */
+		if (!do_udba)
+			continue;
+
+		/* UDBA tests */
+		if (unlikely(!!inode != d_is_positive(h_dentry)))
+			goto err;
+
+		h_inode = NULL;
+		if (d_is_positive(h_dentry))
+			h_inode = d_inode(h_dentry);
+		h_plus = plus;
+		h_mode = mode;
+		h_cached_inode = h_inode;
+		if (h_inode) {
+			h_mode = (h_inode->i_mode & S_IFMT);
+			h_plus = (h_inode->i_nlink > 0);
+		}
+		if (inode && ibs <= bindex && bindex <= ibe)
+			h_cached_inode = au_h_iptr(inode, bindex);
+
+		if (!h_nfs) {
+			if (unlikely(plus != h_plus && !tmpfile))
+				goto err;
+		} else {
+			if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+				     && !is_root
+				     && !IS_ROOT(h_dentry)
+				     && unhashed != d_unhashed(h_dentry)))
+				goto err;
+		}
+		if (unlikely(mode != h_mode
+			     || h_cached_inode != h_inode))
+			goto err;
+		continue;
+
+err:
+		err = -EINVAL;
+		break;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
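+
+/*
+ * Note: the UDBA (user's direct branch access) checks above compare what
+ * aufs has cached (file type, whether nlink is positive, and the cached
+ * lower inode) against what the lower dentry currently reports; any
+ * mismatch invalidates the aufs dentry so that the normal lookup path
+ * rebuilds it.
+ */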
+
+/* todo: consolidate with do_refresh() and au_reval_for_attr() */
+static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *parent;
+
+	if (!au_digen_test(dentry, sigen))
+		return 0;
+
+	parent = dget_parent(dentry);
+	di_read_lock_parent(parent, AuLock_IR);
+	AuDebugOn(au_digen_test(parent, sigen));
+	au_dbg_verify_gen(parent, sigen);
+	err = au_refresh_dentry(dentry, parent);
+	di_read_unlock(parent, AuLock_IR);
+	dput(parent);
+	AuTraceErr(err);
+	return err;
+}
+
+int au_reval_dpath(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *d, *parent;
+
+	if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR))
+		return simple_reval_dpath(dentry, sigen);
+
+	/* slow loop, keep it simple and stupid */
+	/* cf: au_cpup_dirs() */
+	err = 0;
+	parent = NULL;
+	while (au_digen_test(dentry, sigen)) {
+		d = dentry;
+		while (1) {
+			dput(parent);
+			parent = dget_parent(d);
+			if (!au_digen_test(parent, sigen))
+				break;
+			d = parent;
+		}
+
+		if (d != dentry)
+			di_write_lock_child2(d);
+
+		/* someone might update our dentry while we were sleeping */
+		if (au_digen_test(d, sigen)) {
+			/*
+			 * todo: consolidate with simple_reval_dpath(),
+			 * do_refresh() and au_reval_for_attr().
+			 */
+			di_read_lock_parent(parent, AuLock_IR);
+			err = au_refresh_dentry(d, parent);
+			di_read_unlock(parent, AuLock_IR);
+		}
+
+		if (d != dentry)
+			di_write_unlock(d);
+		dput(parent);
+		if (unlikely(err))
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * returns 1 if the dentry is valid, otherwise 0
+ * (or a negative error such as -ECHILD under LOOKUP_RCU).
+ */
+static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int valid, err;
+	unsigned int sigen;
+	unsigned char do_udba;
+	struct super_block *sb;
+	struct inode *inode;
+
+	/* todo: support rcu-walk? */
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	valid = 0;
+	if (unlikely(!au_di(dentry)))
+		goto out;
+
+	valid = 1;
+	sb = dentry->d_sb;
+	/*
+	 * todo: very ugly
+	 * i_mutex of the parent dir may be held,
+	 * but we should not return 'invalid' just because it is busy.
+	 */
+	err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM);
+	if (unlikely(err)) {
+		valid = err;
+		AuTraceErr(err);
+		goto out;
+	}
+	inode = NULL;
+	if (d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (unlikely(inode && is_bad_inode(inode))) {
+		err = -EINVAL;
+		AuTraceErr(err);
+		goto out_dgrade;
+	}
+	if (unlikely(au_dbrange_test(dentry))) {
+		err = -EINVAL;
+		AuTraceErr(err);
+		goto out_dgrade;
+	}
+
+	sigen = au_sigen(sb);
+	if (au_digen_test(dentry, sigen)) {
+		AuDebugOn(IS_ROOT(dentry));
+		err = au_reval_dpath(dentry, sigen);
+		if (unlikely(err)) {
+			AuTraceErr(err);
+			goto out_dgrade;
+		}
+	}
+	di_downgrade_lock(dentry, AuLock_IR);
+
+	err = -EINVAL;
+	if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY))
+	    && inode
+	    && !(inode->i_state & I_LINKABLE)
+	    && (IS_DEADDIR(inode) || !inode->i_nlink)) {
+		AuTraceErr(err);
+		goto out_inval;
+	}
+
+	do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE);
+	if (do_udba && inode) {
+		aufs_bindex_t bstart = au_ibstart(inode);
+		struct inode *h_inode;
+
+		if (bstart >= 0) {
+			h_inode = au_h_iptr(inode, bstart);
+			if (h_inode && au_test_higen(inode, h_inode)) {
+				AuTraceErr(err);
+				goto out_inval;
+			}
+		}
+	}
+
+	err = h_d_revalidate(dentry, inode, flags, do_udba);
+	if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) {
+		err = -EIO;
+		AuDbg("both of real entry and whiteout found, %p, err %d\n",
+		      dentry, err);
+	}
+	goto out_inval;
+
+out_dgrade:
+	di_downgrade_lock(dentry, AuLock_IR);
+out_inval:
+	aufs_read_unlock(dentry, AuLock_IR);
+	AuTraceErr(err);
+	valid = !err;
+out:
+	if (!valid) {
+		AuDbg("%pd invalid, %d\n", dentry, valid);
+		d_drop(dentry);
+	}
+	return valid;
+}
+
+static void aufs_d_release(struct dentry *dentry)
+{
+	if (au_di(dentry)) {
+		au_di_fin(dentry);
+		au_hn_di_reinit(dentry);
+	}
+}
+
+const struct dentry_operations aufs_dop = {
+	.d_revalidate		= aufs_d_revalidate,
+	.d_weak_revalidate	= aufs_d_revalidate,
+	.d_release		= aufs_d_release
+};
+
+/* aufs_dop without d_revalidate */
+const struct dentry_operations aufs_dop_noreval = {
+	.d_release		= aufs_d_release
+};
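+
+/*
+ * Rough usage sketch (hypothetical caller): au_refresh_dop() switches an
+ * existing dentry between the two operation sets above, e.g.
+ *
+ *	au_refresh_dop(dentry, 1);	selects aufs_dop
+ *	au_refresh_dop(dentry, 0);	selects dentry->d_sb->s_d_op
+ *
+ * and keeps DCACHE_OP_{REVALIDATE,WEAK_REVALIDATE} in d_flags consistent
+ * with the chosen table.
+ */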
diff --git b/fs/aufs/dentry.h b/fs/aufs/dentry.h
new file mode 100644
index 0000000..c794adf
--- /dev/null
+++ b/fs/aufs/dentry.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * lookup and dentry operations
+ */
+
+#ifndef __AUFS_DENTRY_H__
+#define __AUFS_DENTRY_H__
+
+#ifdef __KERNEL__
+
+#include <linux/dcache.h>
+#include "rwsem.h"
+
+struct au_hdentry {
+	struct dentry		*hd_dentry;
+	aufs_bindex_t		hd_id;
+};
+
+struct au_dinfo {
+	atomic_t		di_generation;
+
+	struct au_rwsem		di_rwsem;
+	aufs_bindex_t		di_bstart, di_bend, di_bwh, di_bdiropq;
+	unsigned char		di_tmpfile; /* to allow the different name */
+	struct au_hdentry	*di_hdentry;
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* dentry.c */
+extern const struct dentry_operations aufs_dop, aufs_dop_noreval;
+struct au_branch;
+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent);
+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
+		struct dentry *h_parent, struct au_branch *br);
+
+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type);
+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh);
+int au_refresh_dentry(struct dentry *dentry, struct dentry *parent);
+int au_reval_dpath(struct dentry *dentry, unsigned int sigen);
+void au_refresh_dop(struct dentry *dentry, int force_reval);
+
+/* dinfo.c */
+void au_di_init_once(void *_di);
+struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc);
+void au_di_free(struct au_dinfo *dinfo);
+void au_di_swap(struct au_dinfo *a, struct au_dinfo *b);
+void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src);
+int au_di_init(struct dentry *dentry);
+void au_di_fin(struct dentry *dentry);
+int au_di_realloc(struct au_dinfo *dinfo, int nbr);
+
+void di_read_lock(struct dentry *d, int flags, unsigned int lsc);
+void di_read_unlock(struct dentry *d, int flags);
+void di_downgrade_lock(struct dentry *d, int flags);
+void di_write_lock(struct dentry *d, unsigned int lsc);
+void di_write_unlock(struct dentry *d);
+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir);
+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir);
+void di_write_unlock2(struct dentry *d1, struct dentry *d2);
+
+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex);
+struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex);
+aufs_bindex_t au_dbtail(struct dentry *dentry);
+aufs_bindex_t au_dbtaildir(struct dentry *dentry);
+
+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
+		   struct dentry *h_dentry);
+int au_digen_test(struct dentry *dentry, unsigned int sigen);
+int au_dbrange_test(struct dentry *dentry);
+void au_update_digen(struct dentry *dentry);
+void au_update_dbrange(struct dentry *dentry, int do_put_zero);
+void au_update_dbstart(struct dentry *dentry);
+void au_update_dbend(struct dentry *dentry);
+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_dinfo *au_di(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for dinfo */
+enum {
+	AuLsc_DI_CHILD,		/* child first */
+	AuLsc_DI_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
+	AuLsc_DI_CHILD3,	/* copyup dirs */
+	AuLsc_DI_PARENT,
+	AuLsc_DI_PARENT2,
+	AuLsc_DI_PARENT3,
+	AuLsc_DI_TMP		/* temp for replacing dinfo */
+};
+
+/*
+ * di_read_lock_child, di_write_lock_child,
+ * di_read_lock_child2, di_write_lock_child2,
+ * di_read_lock_child3, di_write_lock_child3,
+ * di_read_lock_parent, di_write_lock_parent,
+ * di_read_lock_parent2, di_write_lock_parent2,
+ * di_read_lock_parent3, di_write_lock_parent3,
+ */
+#define AuReadLockFunc(name, lsc) \
+static inline void di_read_lock_##name(struct dentry *d, int flags) \
+{ di_read_lock(d, flags, AuLsc_DI_##lsc); }
+
+#define AuWriteLockFunc(name, lsc) \
+static inline void di_write_lock_##name(struct dentry *d) \
+{ di_write_lock(d, AuLsc_DI_##lsc); }
+
+#define AuRWLockFuncs(name, lsc) \
+	AuReadLockFunc(name, lsc) \
+	AuWriteLockFunc(name, lsc)
+
+AuRWLockFuncs(child, CHILD);
+AuRWLockFuncs(child2, CHILD2);
+AuRWLockFuncs(child3, CHILD3);
+AuRWLockFuncs(parent, PARENT);
+AuRWLockFuncs(parent2, PARENT2);
+AuRWLockFuncs(parent3, PARENT3);
+
+#undef AuReadLockFunc
+#undef AuWriteLockFunc
+#undef AuRWLockFuncs
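+
+/*
+ * For reference, AuRWLockFuncs(child, CHILD) above expands to roughly:
+ *
+ *	static inline void di_read_lock_child(struct dentry *d, int flags)
+ *	{ di_read_lock(d, flags, AuLsc_DI_CHILD); }
+ *	static inline void di_write_lock_child(struct dentry *d)
+ *	{ di_write_lock(d, AuLsc_DI_CHILD); }
+ */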
+
+#define DiMustNoWaiters(d)	AuRwMustNoWaiters(&au_di(d)->di_rwsem)
+#define DiMustAnyLock(d)	AuRwMustAnyLock(&au_di(d)->di_rwsem)
+#define DiMustWriteLock(d)	AuRwMustWriteLock(&au_di(d)->di_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: memory barrier? */
+static inline unsigned int au_digen(struct dentry *d)
+{
+	return atomic_read(&au_di(d)->di_generation);
+}
+
+static inline void au_h_dentry_init(struct au_hdentry *hdentry)
+{
+	hdentry->hd_dentry = NULL;
+}
+
+static inline void au_hdput(struct au_hdentry *hd)
+{
+	if (hd)
+		dput(hd->hd_dentry);
+}
+
+static inline aufs_bindex_t au_dbstart(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bstart;
+}
+
+static inline aufs_bindex_t au_dbend(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bend;
+}
+
+static inline aufs_bindex_t au_dbwh(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bwh;
+}
+
+static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bdiropq;
+}
+
+/* todo: hard/soft set? */
+static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_bstart = bindex;
+}
+
+static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_bend = bindex;
+}
+
+static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	/* dbwh can be outside of bstart - bend range */
+	au_di(dentry)->di_bwh = bindex;
+}
+
+static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_bdiropq = bindex;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_HNOTIFY
+static inline void au_digen_dec(struct dentry *d)
+{
+	atomic_dec(&au_di(d)->di_generation);
+}
+
+static inline void au_hn_di_reinit(struct dentry *dentry)
+{
+	dentry->d_fsdata = NULL;
+}
+#else
+AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused)
+#endif /* CONFIG_AUFS_HNOTIFY */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DENTRY_H__ */
diff --git b/fs/aufs/dinfo.c b/fs/aufs/dinfo.c
new file mode 100644
index 0000000..ad6d045
--- /dev/null
+++ b/fs/aufs/dinfo.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dentry private data
+ */
+
+#include "aufs.h"
+
+void au_di_init_once(void *_dinfo)
+{
+	struct au_dinfo *dinfo = _dinfo;
+	static struct lock_class_key aufs_di;
+
+	au_rw_init(&dinfo->di_rwsem);
+	au_rw_class(&dinfo->di_rwsem, &aufs_di);
+}
+
+struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc)
+{
+	struct au_dinfo *dinfo;
+	int nbr, i;
+
+	dinfo = au_cache_alloc_dinfo();
+	if (unlikely(!dinfo))
+		goto out;
+
+	nbr = au_sbend(sb) + 1;
+	if (nbr <= 0)
+		nbr = 1;
+	dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS);
+	if (dinfo->di_hdentry) {
+		au_rw_write_lock_nested(&dinfo->di_rwsem, lsc);
+		dinfo->di_bstart = -1;
+		dinfo->di_bend = -1;
+		dinfo->di_bwh = -1;
+		dinfo->di_bdiropq = -1;
+		dinfo->di_tmpfile = 0;
+		for (i = 0; i < nbr; i++)
+			dinfo->di_hdentry[i].hd_id = -1;
+		goto out;
+	}
+
+	au_cache_free_dinfo(dinfo);
+	dinfo = NULL;
+
+out:
+	return dinfo;
+}
+
+void au_di_free(struct au_dinfo *dinfo)
+{
+	struct au_hdentry *p;
+	aufs_bindex_t bend, bindex;
+
+	/* dentry may not be revalidated */
+	bindex = dinfo->di_bstart;
+	if (bindex >= 0) {
+		bend = dinfo->di_bend;
+		p = dinfo->di_hdentry + bindex;
+		while (bindex++ <= bend)
+			au_hdput(p++);
+	}
+	kfree(dinfo->di_hdentry);
+	au_cache_free_dinfo(dinfo);
+}
+
+void au_di_swap(struct au_dinfo *a, struct au_dinfo *b)
+{
+	struct au_hdentry *p;
+	aufs_bindex_t bi;
+
+	AuRwMustWriteLock(&a->di_rwsem);
+	AuRwMustWriteLock(&b->di_rwsem);
+
+#define DiSwap(v, name)				\
+	do {					\
+		v = a->di_##name;		\
+		a->di_##name = b->di_##name;	\
+		b->di_##name = v;		\
+	} while (0)
+
+	DiSwap(p, hdentry);
+	DiSwap(bi, bstart);
+	DiSwap(bi, bend);
+	DiSwap(bi, bwh);
+	DiSwap(bi, bdiropq);
+	/* smp_mb(); */
+
+#undef DiSwap
+}
+
+void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src)
+{
+	AuRwMustWriteLock(&dst->di_rwsem);
+	AuRwMustWriteLock(&src->di_rwsem);
+
+	dst->di_bstart = src->di_bstart;
+	dst->di_bend = src->di_bend;
+	dst->di_bwh = src->di_bwh;
+	dst->di_bdiropq = src->di_bdiropq;
+	/* smp_mb(); */
+}
+
+int au_di_init(struct dentry *dentry)
+{
+	int err;
+	struct super_block *sb;
+	struct au_dinfo *dinfo;
+
+	err = 0;
+	sb = dentry->d_sb;
+	dinfo = au_di_alloc(sb, AuLsc_DI_CHILD);
+	if (dinfo) {
+		atomic_set(&dinfo->di_generation, au_sigen(sb));
+		/* smp_mb(); */ /* atomic_set */
+		dentry->d_fsdata = dinfo;
+	} else
+		err = -ENOMEM;
+
+	return err;
+}
+
+void au_di_fin(struct dentry *dentry)
+{
+	struct au_dinfo *dinfo;
+
+	dinfo = au_di(dentry);
+	AuRwDestroy(&dinfo->di_rwsem);
+	au_di_free(dinfo);
+}
+
+int au_di_realloc(struct au_dinfo *dinfo, int nbr)
+{
+	int err, sz;
+	struct au_hdentry *hdp;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	err = -ENOMEM;
+	sz = sizeof(*hdp) * (dinfo->di_bend + 1);
+	if (!sz)
+		sz = sizeof(*hdp);
+	hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS);
+	if (hdp) {
+		dinfo->di_hdentry = hdp;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void do_ii_write_lock(struct inode *inode, unsigned int lsc)
+{
+	switch (lsc) {
+	case AuLsc_DI_CHILD:
+		ii_write_lock_child(inode);
+		break;
+	case AuLsc_DI_CHILD2:
+		ii_write_lock_child2(inode);
+		break;
+	case AuLsc_DI_CHILD3:
+		ii_write_lock_child3(inode);
+		break;
+	case AuLsc_DI_PARENT:
+		ii_write_lock_parent(inode);
+		break;
+	case AuLsc_DI_PARENT2:
+		ii_write_lock_parent2(inode);
+		break;
+	case AuLsc_DI_PARENT3:
+		ii_write_lock_parent3(inode);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void do_ii_read_lock(struct inode *inode, unsigned int lsc)
+{
+	switch (lsc) {
+	case AuLsc_DI_CHILD:
+		ii_read_lock_child(inode);
+		break;
+	case AuLsc_DI_CHILD2:
+		ii_read_lock_child2(inode);
+		break;
+	case AuLsc_DI_CHILD3:
+		ii_read_lock_child3(inode);
+		break;
+	case AuLsc_DI_PARENT:
+		ii_read_lock_parent(inode);
+		break;
+	case AuLsc_DI_PARENT2:
+		ii_read_lock_parent2(inode);
+		break;
+	case AuLsc_DI_PARENT3:
+		ii_read_lock_parent3(inode);
+		break;
+	default:
+		BUG();
+	}
+}
+
+void di_read_lock(struct dentry *d, int flags, unsigned int lsc)
+{
+	struct inode *inode;
+
+	au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc);
+	if (d_really_is_positive(d)) {
+		inode = d_inode(d);
+		if (au_ftest_lock(flags, IW))
+			do_ii_write_lock(inode, lsc);
+		else if (au_ftest_lock(flags, IR))
+			do_ii_read_lock(inode, lsc);
+	}
+}
+
+void di_read_unlock(struct dentry *d, int flags)
+{
+	struct inode *inode;
+
+	if (d_really_is_positive(d)) {
+		inode = d_inode(d);
+		if (au_ftest_lock(flags, IW)) {
+			au_dbg_verify_dinode(d);
+			ii_write_unlock(inode);
+		} else if (au_ftest_lock(flags, IR)) {
+			au_dbg_verify_dinode(d);
+			ii_read_unlock(inode);
+		}
+	}
+	au_rw_read_unlock(&au_di(d)->di_rwsem);
+}
+
+void di_downgrade_lock(struct dentry *d, int flags)
+{
+	if (d_really_is_positive(d) && au_ftest_lock(flags, IR))
+		ii_downgrade_lock(d_inode(d));
+	au_rw_dgrade_lock(&au_di(d)->di_rwsem);
+}
+
+void di_write_lock(struct dentry *d, unsigned int lsc)
+{
+	au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc);
+	if (d_really_is_positive(d))
+		do_ii_write_lock(d_inode(d), lsc);
+}
+
+void di_write_unlock(struct dentry *d)
+{
+	au_dbg_verify_dinode(d);
+	if (d_really_is_positive(d))
+		ii_write_unlock(d_inode(d));
+	au_rw_write_unlock(&au_di(d)->di_rwsem);
+}
+
+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir)
+{
+	AuDebugOn(d1 == d2
+		  || d_inode(d1) == d_inode(d2)
+		  || d1->d_sb != d2->d_sb);
+
+	if (isdir && au_test_subdir(d1, d2)) {
+		di_write_lock_child(d1);
+		di_write_lock_child2(d2);
+	} else {
+		/* there should be no races */
+		di_write_lock_child(d2);
+		di_write_lock_child2(d1);
+	}
+}
+
+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir)
+{
+	AuDebugOn(d1 == d2
+		  || d_inode(d1) == d_inode(d2)
+		  || d1->d_sb != d2->d_sb);
+
+	if (isdir && au_test_subdir(d1, d2)) {
+		di_write_lock_parent(d1);
+		di_write_lock_parent2(d2);
+	} else {
+		/* there should be no races */
+		di_write_lock_parent(d2);
+		di_write_lock_parent2(d1);
+	}
+}
+
+void di_write_unlock2(struct dentry *d1, struct dentry *d2)
+{
+	di_write_unlock(d1);
+	if (d_inode(d1) == d_inode(d2))
+		au_rw_write_unlock(&au_di(d2)->di_rwsem);
+	else
+		di_write_unlock(d2);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	struct dentry *d;
+
+	DiMustAnyLock(dentry);
+
+	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
+		return NULL;
+	AuDebugOn(bindex < 0);
+	d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry;
+	AuDebugOn(d && au_dcount(d) <= 0);
+	return d;
+}
+
+/*
+ * extended version of au_h_dptr().
+ * returns a hashed and positive (or linkable) h_dentry at @bindex, NULL, or
+ * an error pointer.
+ */
+struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	struct dentry *h_dentry;
+	struct inode *inode, *h_inode;
+
+	AuDebugOn(d_really_is_negative(dentry));
+
+	h_dentry = NULL;
+	if (au_dbstart(dentry) <= bindex
+	    && bindex <= au_dbend(dentry))
+		h_dentry = au_h_dptr(dentry, bindex);
+	if (h_dentry && !au_d_linkable(h_dentry)) {
+		dget(h_dentry);
+		goto out; /* success */
+	}
+
+	inode = d_inode(dentry);
+	AuDebugOn(bindex < au_ibstart(inode));
+	AuDebugOn(au_ibend(inode) < bindex);
+	h_inode = au_h_iptr(inode, bindex);
+	h_dentry = d_find_alias(h_inode);
+	if (h_dentry) {
+		if (!IS_ERR(h_dentry)) {
+			if (!au_d_linkable(h_dentry))
+				goto out; /* success */
+			dput(h_dentry);
+		} else
+			goto out;
+	}
+
+	if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) {
+		h_dentry = au_plink_lkup(inode, bindex);
+		AuDebugOn(!h_dentry);
+		if (!IS_ERR(h_dentry)) {
+			if (!au_d_hashed_positive(h_dentry))
+				goto out; /* success */
+			dput(h_dentry);
+			h_dentry = NULL;
+		}
+	}
+
+out:
+	AuDbgDentry(h_dentry);
+	return h_dentry;
+}
+
+aufs_bindex_t au_dbtail(struct dentry *dentry)
+{
+	aufs_bindex_t bend, bwh;
+
+	bend = au_dbend(dentry);
+	if (0 <= bend) {
+		bwh = au_dbwh(dentry);
+		if (!bwh)
+			return bwh;
+		if (0 < bwh && bwh < bend)
+			return bwh - 1;
+	}
+	return bend;
+}
+
+aufs_bindex_t au_dbtaildir(struct dentry *dentry)
+{
+	aufs_bindex_t bend, bopq;
+
+	bend = au_dbtail(dentry);
+	if (0 <= bend) {
+		bopq = au_dbdiropq(dentry);
+		if (0 <= bopq && bopq < bend)
+			bend = bopq;
+	}
+	return bend;
+}
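+
+/*
+ * Worked example: with di_bend == 2 and a whiteout at di_bwh == 1,
+ * au_dbtail() returns 0; with di_bwh == 0 it also returns 0; with no
+ * whiteout (di_bwh < 0) it returns di_bend.  au_dbtaildir() additionally
+ * clips the result to di_bdiropq when an opaque-dir mark sits below it.
+ */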
+
+/* ---------------------------------------------------------------------- */
+
+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
+		   struct dentry *h_dentry)
+{
+	struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex;
+	struct au_branch *br;
+
+	DiMustWriteLock(dentry);
+
+	au_hdput(hd);
+	hd->hd_dentry = h_dentry;
+	if (h_dentry) {
+		br = au_sbr(dentry->d_sb, bindex);
+		hd->hd_id = br->br_id;
+	}
+}
+
+int au_dbrange_test(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bstart, bend;
+
+	err = 0;
+	bstart = au_dbstart(dentry);
+	bend = au_dbend(dentry);
+	if (bstart >= 0)
+		AuDebugOn(bend < 0 && bstart > bend);
+	else {
+		err = -EIO;
+		AuDebugOn(bend >= 0);
+	}
+
+	return err;
+}
+
+int au_digen_test(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(au_digen(dentry) != sigen
+		     || au_iigen_test(d_inode(dentry), sigen)))
+		err = -EIO;
+
+	return err;
+}
+
+void au_update_digen(struct dentry *dentry)
+{
+	atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb));
+	/* smp_mb(); */ /* atomic_set */
+}
+
+void au_update_dbrange(struct dentry *dentry, int do_put_zero)
+{
+	struct au_dinfo *dinfo;
+	struct dentry *h_d;
+	struct au_hdentry *hdp;
+
+	DiMustWriteLock(dentry);
+
+	dinfo = au_di(dentry);
+	if (!dinfo || dinfo->di_bstart < 0)
+		return;
+
+	hdp = dinfo->di_hdentry;
+	if (do_put_zero) {
+		aufs_bindex_t bindex, bend;
+
+		bend = dinfo->di_bend;
+		for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) {
+			h_d = hdp[0 + bindex].hd_dentry;
+			if (h_d && d_is_negative(h_d))
+				au_set_h_dptr(dentry, bindex, NULL);
+		}
+	}
+
+	dinfo->di_bstart = -1;
+	while (++dinfo->di_bstart <= dinfo->di_bend)
+		if (hdp[0 + dinfo->di_bstart].hd_dentry)
+			break;
+	if (dinfo->di_bstart > dinfo->di_bend) {
+		dinfo->di_bstart = -1;
+		dinfo->di_bend = -1;
+		return;
+	}
+
+	dinfo->di_bend++;
+	while (0 <= --dinfo->di_bend)
+		if (hdp[0 + dinfo->di_bend].hd_dentry)
+			break;
+	AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0);
+}
+
+void au_update_dbstart(struct dentry *dentry)
+{
+	aufs_bindex_t bindex, bend;
+	struct dentry *h_dentry;
+
+	bend = au_dbend(dentry);
+	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		if (d_is_positive(h_dentry)) {
+			au_set_dbstart(dentry, bindex);
+			return;
+		}
+		au_set_h_dptr(dentry, bindex, NULL);
+	}
+}
+
+void au_update_dbend(struct dentry *dentry)
+{
+	aufs_bindex_t bindex, bstart;
+	struct dentry *h_dentry;
+
+	bstart = au_dbstart(dentry);
+	for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		if (d_is_positive(h_dentry)) {
+			au_set_dbend(dentry, bindex);
+			return;
+		}
+		au_set_h_dptr(dentry, bindex, NULL);
+	}
+}
+
+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
+{
+	aufs_bindex_t bindex, bend;
+
+	bend = au_dbend(dentry);
+	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++)
+		if (au_h_dptr(dentry, bindex) == h_dentry)
+			return bindex;
+	return -1;
+}
diff --git b/fs/aufs/dir.c b/fs/aufs/dir.c
new file mode 100644
index 0000000..4644de4
--- /dev/null
+++ b/fs/aufs/dir.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * directory operations
+ */
+
+#include <linux/fs_stack.h>
+#include "aufs.h"
+
+void au_add_nlink(struct inode *dir, struct inode *h_dir)
+{
+	unsigned int nlink;
+
+	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
+
+	nlink = dir->i_nlink;
+	nlink += h_dir->i_nlink - 2;
+	if (h_dir->i_nlink < 2)
+		nlink += 2;
+	smp_mb(); /* for i_nlink */
+	/* 0 can happen while revalidating */
+	set_nlink(dir, nlink);
+}
+
+void au_sub_nlink(struct inode *dir, struct inode *h_dir)
+{
+	unsigned int nlink;
+
+	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
+
+	nlink = dir->i_nlink;
+	nlink -= h_dir->i_nlink - 2;
+	if (h_dir->i_nlink < 2)
+		nlink -= 2;
+	smp_mb(); /* for i_nlink */
+	/* nlink == 0 means the branch-fs is broken */
+	set_nlink(dir, nlink);
+}
+
+loff_t au_dir_size(struct file *file, struct dentry *dentry)
+{
+	loff_t sz;
+	aufs_bindex_t bindex, bend;
+	struct file *h_file;
+	struct dentry *h_dentry;
+
+	sz = 0;
+	if (file) {
+		AuDebugOn(!d_is_dir(file->f_path.dentry));
+
+		bend = au_fbend_dir(file);
+		for (bindex = au_fbstart(file);
+		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
+		     bindex++) {
+			h_file = au_hf_dir(file, bindex);
+			if (h_file && file_inode(h_file))
+				sz += vfsub_f_size_read(h_file);
+		}
+	} else {
+		AuDebugOn(!dentry);
+		AuDebugOn(!d_is_dir(dentry));
+
+		bend = au_dbtaildir(dentry);
+		for (bindex = au_dbstart(dentry);
+		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
+		     bindex++) {
+			h_dentry = au_h_dptr(dentry, bindex);
+			if (h_dentry && d_is_positive(h_dentry))
+				sz += i_size_read(d_inode(h_dentry));
+		}
+	}
+	if (sz < KMALLOC_MAX_SIZE)
+		sz = roundup_pow_of_two(sz);
+	if (sz > KMALLOC_MAX_SIZE)
+		sz = KMALLOC_MAX_SIZE;
+	else if (sz < NAME_MAX) {
+		BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
+		sz = AUFS_RDBLK_DEF;
+	}
+	return sz;
+}
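+
+/*
+ * Informally: the lower dir sizes are summed, rounded up to a power of two
+ * and clamped to [AUFS_RDBLK_DEF, KMALLOC_MAX_SIZE]; e.g. lower dirs of 300,
+ * 700 and 100 bytes sum to 1100 and yield 2048.  The result feeds estimates
+ * such as au_rdhash_est() in au_test_empty_lower().
+ */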
+
+struct au_dir_ts_arg {
+	struct dentry *dentry;
+	aufs_bindex_t brid;
+};
+
+static void au_do_dir_ts(void *arg)
+{
+	struct au_dir_ts_arg *a = arg;
+	struct au_dtime dt;
+	struct path h_path;
+	struct inode *dir, *h_dir;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_hinode *hdir;
+	int err;
+	aufs_bindex_t bstart, bindex;
+
+	sb = a->dentry->d_sb;
+	if (d_really_is_negative(a->dentry))
+		goto out;
+	/* no dir->i_mutex lock */
+	aufs_read_lock(a->dentry, AuLock_DW); /* noflush */
+
+	dir = d_inode(a->dentry);
+	bstart = au_ibstart(dir);
+	bindex = au_br_index(sb, a->brid);
+	if (bindex < bstart)
+		goto out_unlock;
+
+	br = au_sbr(sb, bindex);
+	h_path.dentry = au_h_dptr(a->dentry, bindex);
+	if (!h_path.dentry)
+		goto out_unlock;
+	h_path.mnt = au_br_mnt(br);
+	au_dtime_store(&dt, a->dentry, &h_path);
+
+	br = au_sbr(sb, bstart);
+	if (!au_br_writable(br->br_perm))
+		goto out_unlock;
+	h_path.dentry = au_h_dptr(a->dentry, bstart);
+	h_path.mnt = au_br_mnt(br);
+	err = vfsub_mnt_want_write(h_path.mnt);
+	if (err)
+		goto out_unlock;
+	hdir = au_hi(dir, bstart);
+	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
+	h_dir = au_h_iptr(dir, bstart);
+	if (h_dir->i_nlink
+	    && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) {
+		dt.dt_h_path = h_path;
+		au_dtime_revert(&dt);
+	}
+	au_hn_imtx_unlock(hdir);
+	vfsub_mnt_drop_write(h_path.mnt);
+	au_cpup_attr_timesizes(dir);
+
+out_unlock:
+	aufs_read_unlock(a->dentry, AuLock_DW);
+out:
+	dput(a->dentry);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	kfree(arg);
+}
+
+void au_dir_ts(struct inode *dir, aufs_bindex_t bindex)
+{
+	int perm, wkq_err;
+	aufs_bindex_t bstart;
+	struct au_dir_ts_arg *arg;
+	struct dentry *dentry;
+	struct super_block *sb;
+
+	IMustLock(dir);
+
+	dentry = d_find_any_alias(dir);
+	AuDebugOn(!dentry);
+	sb = dentry->d_sb;
+	bstart = au_ibstart(dir);
+	if (bstart == bindex) {
+		au_cpup_attr_timesizes(dir);
+		goto out;
+	}
+
+	perm = au_sbr_perm(sb, bstart);
+	if (!au_br_writable(perm))
+		goto out;
+
+	arg = kmalloc(sizeof(*arg), GFP_NOFS);
+	if (!arg)
+		goto out;
+
+	arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */
+	arg->brid = au_sbr_id(sb, bindex);
+	wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0);
+	if (unlikely(wkq_err)) {
+		pr_err("wkq %d\n", wkq_err);
+		dput(dentry);
+		kfree(arg);
+	}
+
+out:
+	dput(dentry);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int reopen_dir(struct file *file)
+{
+	int err;
+	unsigned int flags;
+	aufs_bindex_t bindex, btail, bstart;
+	struct dentry *dentry, *h_dentry;
+	struct file *h_file;
+
+	/* open all lower dirs */
+	dentry = file->f_path.dentry;
+	bstart = au_dbstart(dentry);
+	for (bindex = au_fbstart(file); bindex < bstart; bindex++)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbstart(file, bstart);
+
+	btail = au_dbtaildir(dentry);
+	for (bindex = au_fbend_dir(file); btail < bindex; bindex--)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbend_dir(file, btail);
+
+	flags = vfsub_file_flags(file);
+	for (bindex = bstart; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		h_file = au_hf_dir(file, bindex);
+		if (h_file)
+			continue;
+
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+		err = PTR_ERR(h_file);
+		if (IS_ERR(h_file))
+			goto out; /* close all? */
+		au_set_h_fptr(file, bindex, h_file);
+	}
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	err = 0;
+
+out:
+	return err;
+}
+
+static int do_open_dir(struct file *file, int flags, struct file *h_file)
+{
+	int err;
+	aufs_bindex_t bindex, btail;
+	struct dentry *dentry, *h_dentry;
+	struct vfsmount *mnt;
+
+	FiMustWriteLock(file);
+	AuDebugOn(h_file);
+
+	err = 0;
+	mnt = file->f_path.mnt;
+	dentry = file->f_path.dentry;
+	file->f_version = d_inode(dentry)->i_version;
+	bindex = au_dbstart(dentry);
+	au_set_fbstart(file, bindex);
+	btail = au_dbtaildir(dentry);
+	au_set_fbend_dir(file, btail);
+	for (; !err && bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+
+		err = vfsub_test_mntns(mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			break;
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+		if (IS_ERR(h_file)) {
+			err = PTR_ERR(h_file);
+			break;
+		}
+		au_set_h_fptr(file, bindex, h_file);
+	}
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	if (!err)
+		return 0; /* success */
+
+	/* close all */
+	for (bindex = au_fbstart(file); bindex <= btail; bindex++)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbstart(file, -1);
+	au_set_fbend_dir(file, -1);
+
+	return err;
+}
+
+static int aufs_open_dir(struct inode *inode __maybe_unused,
+			 struct file *file)
+{
+	int err;
+	struct super_block *sb;
+	struct au_fidir *fidir;
+
+	err = -ENOMEM;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	fidir = au_fidir_alloc(sb);
+	if (fidir) {
+		struct au_do_open_args args = {
+			.open	= do_open_dir,
+			.fidir	= fidir
+		};
+		err = au_do_open(file, &args);
+		if (unlikely(err))
+			kfree(fidir);
+	}
+	si_read_unlock(sb);
+	return err;
+}
+
+static int aufs_release_dir(struct inode *inode __maybe_unused,
+			    struct file *file)
+{
+	struct au_vdir *vdir_cache;
+	struct au_finfo *finfo;
+	struct au_fidir *fidir;
+	aufs_bindex_t bindex, bend;
+
+	finfo = au_fi(file);
+	fidir = finfo->fi_hdir;
+	if (fidir) {
+		au_sphl_del(&finfo->fi_hlist,
+			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+		vdir_cache = fidir->fd_vdir_cache; /* lock-free */
+		if (vdir_cache)
+			au_vdir_free(vdir_cache);
+
+		bindex = finfo->fi_btop;
+		if (bindex >= 0) {
+			/*
+			 * calls fput() instead of filp_close(), since there
+			 * is no dnotify or lock on the lower file.
+			 */
+			bend = fidir->fd_bbot;
+			for (; bindex <= bend; bindex++)
+				au_set_h_fptr(file, bindex, NULL);
+		}
+		kfree(fidir);
+		finfo->fi_hdir = NULL;
+	}
+	au_finfo_fin(file);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_flush_dir(struct file *file, fl_owner_t id)
+{
+	int err;
+	aufs_bindex_t bindex, bend;
+	struct file *h_file;
+
+	err = 0;
+	bend = au_fbend_dir(file);
+	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
+		h_file = au_hf_dir(file, bindex);
+		if (h_file)
+			err = vfsub_flush(h_file, id);
+	}
+	return err;
+}
+
+static int aufs_flush_dir(struct file *file, fl_owner_t id)
+{
+	return au_do_flush(file, id, au_do_flush_dir);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
+{
+	int err;
+	aufs_bindex_t bend, bindex;
+	struct inode *inode;
+	struct super_block *sb;
+
+	err = 0;
+	sb = dentry->d_sb;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	bend = au_dbend(dentry);
+	for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) {
+		struct path h_path;
+
+		if (au_test_ro(sb, bindex, inode))
+			continue;
+		h_path.dentry = au_h_dptr(dentry, bindex);
+		if (!h_path.dentry)
+			continue;
+
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		err = vfsub_fsync(NULL, &h_path, datasync);
+	}
+
+	return err;
+}
+
+static int au_do_fsync_dir(struct file *file, int datasync)
+{
+	int err;
+	aufs_bindex_t bend, bindex;
+	struct file *h_file;
+	struct super_block *sb;
+	struct inode *inode;
+
+	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
+	if (unlikely(err))
+		goto out;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	bend = au_fbend_dir(file);
+	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
+		h_file = au_hf_dir(file, bindex);
+		if (!h_file || au_test_ro(sb, bindex, inode))
+			continue;
+
+		err = vfsub_fsync(h_file, &h_file->f_path, datasync);
+	}
+
+out:
+	return err;
+}
+
+/*
+ * @file may be NULL
+ */
+static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end,
+			  int datasync)
+{
+	int err;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct super_block *sb;
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	inode_lock(inode);
+	sb = dentry->d_sb;
+	si_noflush_read_lock(sb);
+	if (file)
+		err = au_do_fsync_dir(file, datasync);
+	else {
+		di_write_lock_child(dentry);
+		err = au_do_fsync_dir_no_file(dentry, datasync);
+	}
+	au_cpup_attr_timesizes(inode);
+	di_write_unlock(dentry);
+	if (file)
+		fi_write_unlock(file);
+
+	si_read_unlock(sb);
+	inode_unlock(inode);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_iterate(struct file *file, struct dir_context *ctx)
+{
+	int err;
+	struct dentry *dentry;
+	struct inode *inode, *h_inode;
+	struct super_block *sb;
+
+	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
+
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+
+	sb = dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
+	if (unlikely(err))
+		goto out;
+	err = au_alive_dir(dentry);
+	if (!err)
+		err = au_vdir_init(file);
+	di_downgrade_lock(dentry, AuLock_IR);
+	if (unlikely(err))
+		goto out_unlock;
+
+	h_inode = au_h_iptr(inode, au_ibstart(inode));
+	if (!au_test_nfsd()) {
+		err = au_vdir_fill_de(file, ctx);
+		fsstack_copy_attr_atime(inode, h_inode);
+	} else {
+		/*
+		 * nfsd filldir may call lookup_one_len(), vfs_getattr(),
+		 * encode_fh() and others.
+		 */
+		atomic_inc(&h_inode->i_count);
+		di_read_unlock(dentry, AuLock_IR);
+		si_read_unlock(sb);
+		err = au_vdir_fill_de(file, ctx);
+		fsstack_copy_attr_atime(inode, h_inode);
+		fi_write_unlock(file);
+		iput(h_inode);
+
+		AuTraceErr(err);
+		return err;
+	}
+
+out_unlock:
+	di_read_unlock(dentry, AuLock_IR);
+	fi_write_unlock(file);
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define AuTestEmpty_WHONLY	1
+#define AuTestEmpty_CALLED	(1 << 1)
+#define AuTestEmpty_SHWH	(1 << 2)
+#define au_ftest_testempty(flags, name)	((flags) & AuTestEmpty_##name)
+#define au_fset_testempty(flags, name) \
+	do { (flags) |= AuTestEmpty_##name; } while (0)
+#define au_fclr_testempty(flags, name) \
+	do { (flags) &= ~AuTestEmpty_##name; } while (0)
+
+#ifndef CONFIG_AUFS_SHWH
+#undef AuTestEmpty_SHWH
+#define AuTestEmpty_SHWH	0
+#endif
+
+struct test_empty_arg {
+	struct dir_context ctx;
+	struct au_nhash *whlist;
+	unsigned int flags;
+	int err;
+	aufs_bindex_t bindex;
+};
+
+static int test_empty_cb(struct dir_context *ctx, const char *__name,
+			 int namelen, loff_t offset __maybe_unused, u64 ino,
+			 unsigned int d_type)
+{
+	struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg,
+						  ctx);
+	char *name = (void *)__name;
+
+	arg->err = 0;
+	au_fset_testempty(arg->flags, CALLED);
+	/* smp_mb(); */
+	if (name[0] == '.'
+	    && (namelen == 1 || (name[1] == '.' && namelen == 2)))
+		goto out; /* success */
+
+	if (namelen <= AUFS_WH_PFX_LEN
+	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+		if (au_ftest_testempty(arg->flags, WHONLY)
+		    && !au_nhash_test_known_wh(arg->whlist, name, namelen))
+			arg->err = -ENOTEMPTY;
+		goto out;
+	}
+
+	name += AUFS_WH_PFX_LEN;
+	namelen -= AUFS_WH_PFX_LEN;
+	if (!au_nhash_test_known_wh(arg->whlist, name, namelen))
+		arg->err = au_nhash_append_wh
+			(arg->whlist, name, namelen, ino, d_type, arg->bindex,
+			 au_ftest_testempty(arg->flags, SHWH));
+
+out:
+	/* smp_mb(); */
+	AuTraceErr(arg->err);
+	return arg->err;
+}
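+
+/*
+ * Example of the callback above: an entry named ".wh.foo" (AUFS_WH_PFX being
+ * ".wh.") is recorded in the whiteout list as a known whiteout for "foo";
+ * in WHONLY mode a plain entry such as "foo" is tolerated only when a
+ * matching whiteout has already been recorded, otherwise -ENOTEMPTY.
+ */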
+
+static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
+{
+	int err;
+	struct file *h_file;
+
+	h_file = au_h_open(dentry, arg->bindex,
+			   O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE,
+			   /*file*/NULL, /*force_wr*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = 0;
+	if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE)
+	    && !file_inode(h_file)->i_nlink)
+		goto out_put;
+
+	do {
+		arg->err = 0;
+		au_fclr_testempty(arg->flags, CALLED);
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(h_file, &arg->ctx);
+		if (err >= 0)
+			err = arg->err;
+	} while (!err && au_ftest_testempty(arg->flags, CALLED));
+
+out_put:
+	fput(h_file);
+	au_sbr_put(dentry->d_sb, arg->bindex);
+out:
+	return err;
+}
+
+struct do_test_empty_args {
+	int *errp;
+	struct dentry *dentry;
+	struct test_empty_arg *arg;
+};
+
+static void call_do_test_empty(void *args)
+{
+	struct do_test_empty_args *a = args;
+	*a->errp = do_test_empty(a->dentry, a->arg);
+}
+
+static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
+{
+	int err, wkq_err;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_dentry = au_h_dptr(dentry, arg->bindex);
+	h_inode = d_inode(h_dentry);
+	/* todo: i_mode changes anytime? */
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ);
+	inode_unlock(h_inode);
+	if (!err)
+		err = do_test_empty(dentry, arg);
+	else {
+		struct do_test_empty_args args = {
+			.errp	= &err,
+			.dentry	= dentry,
+			.arg	= arg
+		};
+		unsigned int flags = arg->flags;
+
+		wkq_err = au_wkq_wait(call_do_test_empty, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+		arg->flags = flags;
+	}
+
+	return err;
+}
+
+int au_test_empty_lower(struct dentry *dentry)
+{
+	int err;
+	unsigned int rdhash;
+	aufs_bindex_t bindex, bstart, btail;
+	struct au_nhash whlist;
+	struct test_empty_arg arg = {
+		.ctx = {
+			.actor = test_empty_cb
+		}
+	};
+	int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg);
+
+	SiMustAnyLock(dentry->d_sb);
+
+	rdhash = au_sbi(dentry->d_sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry));
+	err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+
+	arg.flags = 0;
+	arg.whlist = &whlist;
+	bstart = au_dbstart(dentry);
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
+		au_fset_testempty(arg.flags, SHWH);
+	test_empty = do_test_empty;
+	if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1))
+		test_empty = sio_test_empty;
+	arg.bindex = bstart;
+	err = test_empty(dentry, &arg);
+	if (unlikely(err))
+		goto out_whlist;
+
+	au_fset_testempty(arg.flags, WHONLY);
+	btail = au_dbtaildir(dentry);
+	for (bindex = bstart + 1; !err && bindex <= btail; bindex++) {
+		struct dentry *h_dentry;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry && d_is_positive(h_dentry)) {
+			arg.bindex = bindex;
+			err = test_empty(dentry, &arg);
+		}
+	}
+
+out_whlist:
+	au_nhash_wh_free(&whlist);
+out:
+	return err;
+}
+
+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist)
+{
+	int err;
+	struct test_empty_arg arg = {
+		.ctx = {
+			.actor = test_empty_cb
+		}
+	};
+	aufs_bindex_t bindex, btail;
+
+	err = 0;
+	arg.whlist = whlist;
+	arg.flags = AuTestEmpty_WHONLY;
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
+		au_fset_testempty(arg.flags, SHWH);
+	btail = au_dbtaildir(dentry);
+	for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) {
+		struct dentry *h_dentry;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry && d_is_positive(h_dentry)) {
+			arg.bindex = bindex;
+			err = sio_test_empty(dentry, &arg);
+		}
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+const struct file_operations aufs_dir_fop = {
+	.owner		= THIS_MODULE,
+	.llseek		= default_llseek,
+	.read		= generic_read_dir,
+	.iterate	= aufs_iterate,
+	.unlocked_ioctl	= aufs_ioctl_dir,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= aufs_compat_ioctl_dir,
+#endif
+	.open		= aufs_open_dir,
+	.release	= aufs_release_dir,
+	.flush		= aufs_flush_dir,
+	.fsync		= aufs_fsync_dir
+};
diff --git b/fs/aufs/dir.h b/fs/aufs/dir.h
new file mode 100644
index 0000000..b0a79d7
--- /dev/null
+++ b/fs/aufs/dir.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * directory operations
+ */
+
+#ifndef __AUFS_DIR_H__
+#define __AUFS_DIR_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+
+/* ---------------------------------------------------------------------- */
+
+/* need to be faster and smaller */
+
+struct au_nhash {
+	unsigned int		nh_num;
+	struct hlist_head	*nh_head;
+};
+
+struct au_vdir_destr {
+	unsigned char	len;
+	unsigned char	name[0];
+} __packed;
+
+struct au_vdir_dehstr {
+	struct hlist_node	hash;
+	struct au_vdir_destr	*str;
+} ____cacheline_aligned_in_smp;
+
+struct au_vdir_de {
+	ino_t			de_ino;
+	unsigned char		de_type;
+	/* caution: packed */
+	struct au_vdir_destr	de_str;
+} __packed;
+
+struct au_vdir_wh {
+	struct hlist_node	wh_hash;
+#ifdef CONFIG_AUFS_SHWH
+	ino_t			wh_ino;
+	aufs_bindex_t		wh_bindex;
+	unsigned char		wh_type;
+#else
+	aufs_bindex_t		wh_bindex;
+#endif
+	/* caution: packed */
+	struct au_vdir_destr	wh_str;
+} __packed;
+
+union au_vdir_deblk_p {
+	unsigned char		*deblk;
+	struct au_vdir_de	*de;
+};
+
+struct au_vdir {
+	unsigned char	**vd_deblk;
+	unsigned long	vd_nblk;
+	struct {
+		unsigned long		ul;
+		union au_vdir_deblk_p	p;
+	} vd_last;
+
+	unsigned long	vd_version;
+	unsigned int	vd_deblk_sz;
+	unsigned long	vd_jiffy;
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* dir.c */
+extern const struct file_operations aufs_dir_fop;
+void au_add_nlink(struct inode *dir, struct inode *h_dir);
+void au_sub_nlink(struct inode *dir, struct inode *h_dir);
+loff_t au_dir_size(struct file *file, struct dentry *dentry);
+void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc);
+int au_test_empty_lower(struct dentry *dentry);
+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist);
+
+/* vdir.c */
+unsigned int au_rdhash_est(loff_t sz);
+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp);
+void au_nhash_wh_free(struct au_nhash *whlist);
+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
+			    int limit);
+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen);
+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
+		       unsigned int d_type, aufs_bindex_t bindex,
+		       unsigned char shwh);
+void au_vdir_free(struct au_vdir *vdir);
+int au_vdir_init(struct file *file);
+int au_vdir_fill_de(struct file *file, struct dir_context *ctx);
+
+/* ioctl.c */
+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg);
+
+#ifdef CONFIG_AUFS_RDU
+/* rdu.c */
+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_rdu_compat_ioctl(struct file *file, unsigned int cmd,
+			 unsigned long arg);
+#endif
+#else
+AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file,
+       unsigned int cmd, unsigned long arg)
+#ifdef CONFIG_COMPAT
+AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file,
+       unsigned int cmd, unsigned long arg)
+#endif
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DIR_H__ */
diff --git b/fs/aufs/dynop.c b/fs/aufs/dynop.c
new file mode 100644
index 0000000..53a8b55
--- /dev/null
+++ b/fs/aufs/dynop.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dynamically customizable operations for regular files
+ */
+
+#include "aufs.h"
+
+#define DyPrSym(key)	AuDbgSym(key->dk_op.dy_hop)
+
+/*
+ * How large will these lists be?
+ * Usually just a few elements, 20-30 at most for each, I guess.
+ */
+static struct au_splhead dynop[AuDyLast];
+
+static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op)
+{
+	struct au_dykey *key, *tmp;
+	struct list_head *head;
+
+	key = NULL;
+	head = &spl->head;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp, head, dk_list)
+		if (tmp->dk_op.dy_hop == h_op) {
+			key = tmp;
+			kref_get(&key->dk_kref);
+			break;
+		}
+	rcu_read_unlock();
+
+	return key;
+}
+
+static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key)
+{
+	struct au_dykey **k, *found;
+	const void *h_op = key->dk_op.dy_hop;
+	int i;
+
+	found = NULL;
+	k = br->br_dykey;
+	for (i = 0; i < AuBrDynOp; i++)
+		if (k[i]) {
+			if (k[i]->dk_op.dy_hop == h_op) {
+				found = k[i];
+				break;
+			}
+		} else
+			break;
+	if (!found) {
+		spin_lock(&br->br_dykey_lock);
+		for (; i < AuBrDynOp; i++)
+			if (k[i]) {
+				if (k[i]->dk_op.dy_hop == h_op) {
+					found = k[i];
+					break;
+				}
+			} else {
+				k[i] = key;
+				break;
+			}
+		spin_unlock(&br->br_dykey_lock);
+		BUG_ON(i == AuBrDynOp); /* expand the array */
+	}
+
+	return found;
+}
+
+/* kref_get() if @key is already added */
+static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key)
+{
+	struct au_dykey *tmp, *found;
+	struct list_head *head;
+	const void *h_op = key->dk_op.dy_hop;
+
+	found = NULL;
+	head = &spl->head;
+	spin_lock(&spl->spin);
+	list_for_each_entry(tmp, head, dk_list)
+		if (tmp->dk_op.dy_hop == h_op) {
+			kref_get(&tmp->dk_kref);
+			found = tmp;
+			break;
+		}
+	if (!found)
+		list_add_rcu(&key->dk_list, head);
+	spin_unlock(&spl->spin);
+
+	if (!found)
+		DyPrSym(key);
+	return found;
+}
+
+static void dy_free_rcu(struct rcu_head *rcu)
+{
+	struct au_dykey *key;
+
+	key = container_of(rcu, struct au_dykey, dk_rcu);
+	DyPrSym(key);
+	kfree(key);
+}
+
+static void dy_free(struct kref *kref)
+{
+	struct au_dykey *key;
+	struct au_splhead *spl;
+
+	key = container_of(kref, struct au_dykey, dk_kref);
+	spl = dynop + key->dk_op.dy_type;
+	au_spl_del_rcu(&key->dk_list, spl);
+	call_rcu(&key->dk_rcu, dy_free_rcu);
+}
+
+void au_dy_put(struct au_dykey *key)
+{
+	kref_put(&key->dk_kref, dy_free);
+}
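+
+/*
+ * Lifetime note: dy_free() is the kref release function; it unlinks the key
+ * from the global list via au_spl_del_rcu() and defers the actual kfree()
+ * to an RCU grace period, so the lock-free readers in dy_gfind_get() never
+ * dereference freed memory.
+ */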
+
+/* ---------------------------------------------------------------------- */
+
+#define DyDbgSize(cnt, op)	AuDebugOn(cnt != sizeof(op)/sizeof(void *))
+
+#ifdef CONFIG_AUFS_DEBUG
+#define DyDbgDeclare(cnt)	unsigned int cnt = 0
+#define DyDbgInc(cnt)		do { cnt++; } while (0)
+#else
+#define DyDbgDeclare(cnt)	do {} while (0)
+#define DyDbgInc(cnt)		do {} while (0)
+#endif
+
+#define DySet(func, dst, src, h_op, h_sb) do {				\
+	DyDbgInc(cnt);							\
+	if (h_op->func) {						\
+		if (src.func)						\
+			dst.func = src.func;				\
+		else							\
+			AuDbg("%s %s\n", au_sbtype(h_sb), #func);	\
+	}								\
+} while (0)
+
+#define DySetForce(func, dst, src) do {		\
+	AuDebugOn(!src.func);			\
+	DyDbgInc(cnt);				\
+	dst.func = src.func;			\
+} while (0)
+
+#define DySetAop(func) \
+	DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb)
+#define DySetAopForce(func) \
+	DySetForce(func, dyaop->da_op, aufs_aop)
+
+static void dy_aop(struct au_dykey *key, const void *h_op,
+		   struct super_block *h_sb __maybe_unused)
+{
+	struct au_dyaop *dyaop = (void *)key;
+	const struct address_space_operations *h_aop = h_op;
+	DyDbgDeclare(cnt);
+
+	AuDbg("%s\n", au_sbtype(h_sb));
+
+	DySetAop(writepage);
+	DySetAopForce(readpage);	/* force */
+	DySetAop(writepages);
+	DySetAop(set_page_dirty);
+	DySetAop(readpages);
+	DySetAop(write_begin);
+	DySetAop(write_end);
+	DySetAop(bmap);
+	DySetAop(invalidatepage);
+	DySetAop(releasepage);
+	DySetAop(freepage);
+	/* this one will be changed according to an aufs mount option */
+	DySetAop(direct_IO);
+	DySetAop(migratepage);
+	DySetAop(launder_page);
+	DySetAop(is_partially_uptodate);
+	DySetAop(is_dirty_writeback);
+	DySetAop(error_remove_page);
+	DySetAop(swap_activate);
+	DySetAop(swap_deactivate);
+
+	DyDbgSize(cnt, *h_aop);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void dy_bug(struct kref *kref)
+{
+	BUG();
+}
+
+static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br)
+{
+	struct au_dykey *key, *old;
+	struct au_splhead *spl;
+	struct op {
+		unsigned int sz;
+		void (*set)(struct au_dykey *key, const void *h_op,
+			    struct super_block *h_sb __maybe_unused);
+	};
+	static const struct op a[] = {
+		[AuDy_AOP] = {
+			.sz	= sizeof(struct au_dyaop),
+			.set	= dy_aop
+		}
+	};
+	const struct op *p;
+
+	spl = dynop + op->dy_type;
+	key = dy_gfind_get(spl, op->dy_hop);
+	if (key)
+		goto out_add; /* success */
+
+	p = a + op->dy_type;
+	key = kzalloc(p->sz, GFP_NOFS);
+	if (unlikely(!key)) {
+		key = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	key->dk_op.dy_hop = op->dy_hop;
+	kref_init(&key->dk_kref);
+	p->set(key, op->dy_hop, au_br_sb(br));
+	old = dy_gadd(spl, key);
+	if (old) {
+		kfree(key);
+		key = old;
+	}
+
+out_add:
+	old = dy_bradd(br, key);
+	if (old)
+		/* its ref-count should never be zero here */
+		kref_put(&key->dk_kref, dy_bug);
+out:
+	return key;
+}
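+
+/*
+ * dy_get() above follows a lookup-or-insert pattern: try dy_gfind_get()
+ * under RCU, allocate outside any lock on a miss, then dy_gadd() under the
+ * spinlock.  If another task won the race, the freshly allocated key is
+ * discarded and the winner's (already kref_get-ed) key is used instead;
+ * either way dy_bradd() caches the key in the branch-local array.
+ */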
+
+/* ---------------------------------------------------------------------- */
+/*
+ * Aufs prohibits O_DIRECT by default even if the branch supports it.
+ * This behaviour is necessary to return an error from open(O_DIRECT) instead
+ * of from the subsequent I/O. The dio mount option enables O_DIRECT and makes
+ * open(O_DIRECT) always succeed, but then the subsequent I/O may return an
+ * error. See the aufs manual for details.
+ */
+static void dy_adx(struct au_dyaop *dyaop, int do_dx)
+{
+	if (!do_dx)
+		dyaop->da_op.direct_IO = NULL;
+	else
+		dyaop->da_op.direct_IO = aufs_aop.direct_IO;
+}
+
+static struct au_dyaop *dy_aget(struct au_branch *br,
+				const struct address_space_operations *h_aop,
+				int do_dx)
+{
+	struct au_dyaop *dyaop;
+	struct au_dynop op;
+
+	op.dy_type = AuDy_AOP;
+	op.dy_haop = h_aop;
+	dyaop = (void *)dy_get(&op, br);
+	if (IS_ERR(dyaop))
+		goto out;
+	dy_adx(dyaop, do_dx);
+
+out:
+	return dyaop;
+}
+
+int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
+		struct inode *h_inode)
+{
+	int err, do_dx;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_dyaop *dyaop;
+
+	AuDebugOn(!S_ISREG(h_inode->i_mode));
+	IiMustWriteLock(inode);
+
+	sb = inode->i_sb;
+	br = au_sbr(sb, bindex);
+	do_dx = !!au_opt_test(au_mntflags(sb), DIO);
+	dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx);
+	err = PTR_ERR(dyaop);
+	if (IS_ERR(dyaop))
+		/* unnecessary to call dy_fput() */
+		goto out;
+
+	err = 0;
+	inode->i_mapping->a_ops = &dyaop->da_op;
+
+out:
+	return err;
+}
+
+/*
+ * Is it safe to replace a_ops while the inode/file is in operation?
+ * Yes, I hope so.
+ */
+int au_dy_irefresh(struct inode *inode)
+{
+	int err;
+	aufs_bindex_t bstart;
+	struct inode *h_inode;
+
+	err = 0;
+	if (S_ISREG(inode->i_mode)) {
+		bstart = au_ibstart(inode);
+		h_inode = au_h_iptr(inode, bstart);
+		err = au_dy_iaop(inode, bstart, h_inode);
+	}
+	return err;
+}
+
+void au_dy_arefresh(int do_dx)
+{
+	struct au_splhead *spl;
+	struct list_head *head;
+	struct au_dykey *key;
+
+	spl = dynop + AuDy_AOP;
+	head = &spl->head;
+	spin_lock(&spl->spin);
+	list_for_each_entry(key, head, dk_list)
+		dy_adx((void *)key, do_dx);
+	spin_unlock(&spl->spin);
+}
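+
+/*
+ * Usage sketch: when the 'dio' mount option is toggled (presumably at
+ * remount time), au_dy_arefresh(1) or au_dy_arefresh(0) walks every cached
+ * AOP key and lets dy_adx() set or clear ->direct_IO accordingly.
+ */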
+
+/* ---------------------------------------------------------------------- */
+
+void __init au_dy_init(void)
+{
+	int i;
+
+	/* make sure that 'struct au_dykey *' can be any type */
+	BUILD_BUG_ON(offsetof(struct au_dyaop, da_key));
+
+	for (i = 0; i < AuDyLast; i++)
+		au_spl_init(dynop + i);
+}
+
+void au_dy_fin(void)
+{
+	int i;
+
+	for (i = 0; i < AuDyLast; i++)
+		WARN_ON(!list_empty(&dynop[i].head));
+}
diff --git b/fs/aufs/dynop.h b/fs/aufs/dynop.h
new file mode 100644
index 0000000..8680bfc
--- /dev/null
+++ b/fs/aufs/dynop.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dynamically customizable operations (for regular files only)
+ */
+
+#ifndef __AUFS_DYNOP_H__
+#define __AUFS_DYNOP_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/kref.h>
+
+enum {AuDy_AOP, AuDyLast};
+
+struct au_dynop {
+	int						dy_type;
+	union {
+		const void				*dy_hop;
+		const struct address_space_operations	*dy_haop;
+	};
+};
+
+struct au_dykey {
+	union {
+		struct list_head	dk_list;
+		struct rcu_head		dk_rcu;
+	};
+	struct au_dynop		dk_op;
+
+	/*
+	 * while this key is in the branch-local array, a kref is held.
+	 * when the branch is removed, the kref is put.
+	 */
+	struct kref		dk_kref;
+};
+
+/* these are not unioned, since their sizes differ greatly from each other */
+struct au_dyaop {
+	struct au_dykey			da_key;
+	struct address_space_operations	da_op; /* not const */
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* dynop.c */
+struct au_branch;
+void au_dy_put(struct au_dykey *key);
+int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
+		struct inode *h_inode);
+int au_dy_irefresh(struct inode *inode);
+void au_dy_arefresh(int do_dio);
+
+void __init au_dy_init(void);
+void au_dy_fin(void);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DYNOP_H__ */
diff --git b/fs/aufs/export.c b/fs/aufs/export.c
new file mode 100644
index 0000000..69b70f2
--- /dev/null
+++ b/fs/aufs/export.c
@@ -0,0 +1,817 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * export via nfs
+ */
+
+#include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/nsproxy.h>
+#include <linux/random.h>
+#include <linux/writeback.h>
+#include "../fs/mount.h"
+#include "aufs.h"
+
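+/*
+ * helpers to pack an ino_t into the __u32 array of an NFS file handle.
+ * the split simply follows the host representation of ino_t, which is fine
+ * since a handle is decoded on the same host that encoded it.
+ */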
+union conv {
+#ifdef CONFIG_AUFS_INO_T_64
+	__u32 a[2];
+#else
+	__u32 a[1];
+#endif
+	ino_t ino;
+};
+
+static ino_t decode_ino(__u32 *a)
+{
+	union conv u;
+
+	BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a));
+	u.a[0] = a[0];
+#ifdef CONFIG_AUFS_INO_T_64
+	u.a[1] = a[1];
+#endif
+	return u.ino;
+}
+
+static void encode_ino(__u32 *a, ino_t ino)
+{
+	union conv u;
+
+	u.ino = ino;
+	a[0] = u.a[0];
+#ifdef CONFIG_AUFS_INO_T_64
+	a[1] = u.a[1];
+#endif
+}
+
+/* NFS file handle */
+enum {
+	Fh_br_id,
+	Fh_sigen,
+#ifdef CONFIG_AUFS_INO_T_64
+	/* support 64bit inode number */
+	Fh_ino1,
+	Fh_ino2,
+	Fh_dir_ino1,
+	Fh_dir_ino2,
+#else
+	Fh_ino1,
+	Fh_dir_ino1,
+#endif
+	Fh_igen,
+	Fh_h_type,
+	Fh_tail,
+
+	Fh_ino = Fh_ino1,
+	Fh_dir_ino = Fh_dir_ino1
+};
+
+static int au_test_anon(struct dentry *dentry)
+{
+	/* note: read d_flags without d_lock */
+	return !!(dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
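+/* heuristic: the in-kernel nfsd runs as a kernel thread named "nfsd" */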
+int au_test_nfsd(void)
+{
+	int ret;
+	struct task_struct *tsk = current;
+	char comm[sizeof(tsk->comm)];
+
+	ret = 0;
+	if (tsk->flags & PF_KTHREAD) {
+		get_task_comm(comm, tsk);
+		ret = !strcmp(comm, "nfsd");
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+/* inode generation external table */
+
+void au_xigen_inc(struct inode *inode)
+{
+	loff_t pos;
+	ssize_t sz;
+	__u32 igen;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	sb = inode->i_sb;
+	AuDebugOn(!au_opt_test(au_mntflags(sb), XINO));
+
+	sbinfo = au_sbi(sb);
+	pos = inode->i_ino;
+	pos *= sizeof(igen);
+	igen = inode->i_generation + 1;
+	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen,
+			 sizeof(igen), &pos);
+	if (sz == sizeof(igen))
+		return; /* success */
+
+	if (unlikely(sz >= 0))
+		AuIOErr("xigen error (%zd)\n", sz);
+}
+
+int au_xigen_new(struct inode *inode)
+{
+	int err;
+	loff_t pos;
+	ssize_t sz;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	err = 0;
+	/* todo: dirty, at mount time */
+	if (inode->i_ino == AUFS_ROOT_INO)
+		goto out;
+	sb = inode->i_sb;
+	SiMustAnyLock(sb);
+	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
+		goto out;
+
+	err = -EFBIG;
+	pos = inode->i_ino;
+	if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) {
+		AuIOErr1("too large i%lld\n", pos);
+		goto out;
+	}
+	pos *= sizeof(inode->i_generation);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	file = sbinfo->si_xigen;
+	BUG_ON(!file);
+
+	if (vfsub_f_size_read(file)
+	    < pos + sizeof(inode->i_generation)) {
+		inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next);
+		sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation,
+				 sizeof(inode->i_generation), &pos);
+	} else
+		sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation,
+				sizeof(inode->i_generation), &pos);
+	if (sz == sizeof(inode->i_generation))
+		goto out; /* success */
+
+	err = sz;
+	if (unlikely(sz >= 0)) {
+		err = -EIO;
+		AuIOErr("xigen error (%zd)\n", sz);
+	}
+
+out:
+	return err;
+}
+
+int au_xigen_set(struct super_block *sb, struct file *base)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	file = au_xino_create2(base, sbinfo->si_xigen);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	err = 0;
+	if (sbinfo->si_xigen)
+		fput(sbinfo->si_xigen);
+	sbinfo->si_xigen = file;
+
+out:
+	return err;
+}
+
+void au_xigen_clr(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	if (sbinfo->si_xigen) {
+		fput(sbinfo->si_xigen);
+		sbinfo->si_xigen = NULL;
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino,
+				    ino_t dir_ino)
+{
+	struct dentry *dentry, *d;
+	struct inode *inode;
+	unsigned int sigen;
+
+	dentry = NULL;
+	inode = ilookup(sb, ino);
+	if (!inode)
+		goto out;
+
+	dentry = ERR_PTR(-ESTALE);
+	sigen = au_sigen(sb);
+	if (unlikely(is_bad_inode(inode)
+		     || IS_DEADDIR(inode)
+		     || sigen != au_iigen(inode, NULL)))
+		goto out_iput;
+
+	dentry = NULL;
+	if (!dir_ino || S_ISDIR(inode->i_mode))
+		dentry = d_find_alias(inode);
+	else {
+		spin_lock(&inode->i_lock);
+		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
+			spin_lock(&d->d_lock);
+			if (!au_test_anon(d)
+			    && d_inode(d->d_parent)->i_ino == dir_ino) {
+				dentry = dget_dlock(d);
+				spin_unlock(&d->d_lock);
+				break;
+			}
+			spin_unlock(&d->d_lock);
+		}
+		spin_unlock(&inode->i_lock);
+	}
+	if (unlikely(dentry && au_digen_test(dentry, sigen))) {
+		/* need to refresh */
+		dput(dentry);
+		dentry = NULL;
+	}
+
+out_iput:
+	iput(inode);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: dirty? */
+/*
+ * if exportfs_decode_fh() passed a vfsmount*, the au_mnt_get() workaround
+ * below would be unnecessary.
+ */
+
+struct au_compare_mnt_args {
+	/* input */
+	struct super_block *sb;
+
+	/* output */
+	struct vfsmount *mnt;
+};
+
+static int au_compare_mnt(struct vfsmount *mnt, void *arg)
+{
+	struct au_compare_mnt_args *a = arg;
+
+	if (mnt->mnt_sb != a->sb)
+		return 0;
+	a->mnt = mntget(mnt);
+	return 1;
+}
+
+static struct vfsmount *au_mnt_get(struct super_block *sb)
+{
+	int err;
+	struct path root;
+	struct au_compare_mnt_args args = {
+		.sb = sb
+	};
+
+	get_fs_root(current->fs, &root);
+	rcu_read_lock();
+	err = iterate_mounts(au_compare_mnt, &args, root.mnt);
+	rcu_read_unlock();
+	path_put(&root);
+	AuDebugOn(!err);
+	AuDebugOn(!args.mnt);
+	return args.mnt;
+}
+
+struct au_nfsd_si_lock {
+	unsigned int sigen;
+	aufs_bindex_t bindex, br_id;
+	unsigned char force_lock;
+};
+
+static int si_nfsd_read_lock(struct super_block *sb,
+			     struct au_nfsd_si_lock *nsi_lock)
+{
+	int err;
+	aufs_bindex_t bindex;
+
+	si_read_lock(sb, AuLock_FLUSH);
+
+	/* branch id may be wrapped around */
+	err = 0;
+	bindex = au_br_index(sb, nsi_lock->br_id);
+	if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb))
+		goto out; /* success */
+
+	err = -ESTALE;
+	bindex = -1;
+	if (!nsi_lock->force_lock)
+		si_read_unlock(sb);
+
+out:
+	nsi_lock->bindex = bindex;
+	return err;
+}
+
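+/* find the name of a directory entry by its inode number, via ->iterate() */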
+struct find_name_by_ino {
+	struct dir_context ctx;
+	int called, found;
+	ino_t ino;
+	char *name;
+	int namelen;
+};
+
+static int
+find_name_by_ino(struct dir_context *ctx, const char *name, int namelen,
+		 loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino,
+						  ctx);
+
+	a->called++;
+	if (a->ino != ino)
+		return 0;
+
+	memcpy(a->name, name, namelen);
+	a->namelen = namelen;
+	a->found = 1;
+	return 1;
+}
+
+static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino,
+				     struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry, *parent;
+	struct file *file;
+	struct inode *dir;
+	struct find_name_by_ino arg = {
+		.ctx = {
+			.actor = find_name_by_ino
+		}
+	};
+	int err;
+
+	parent = path->dentry;
+	if (nsi_lock)
+		si_read_unlock(parent->d_sb);
+	file = vfsub_dentry_open(path, au_dir_roflags);
+	dentry = (void *)file;
+	if (IS_ERR(file))
+		goto out;
+
+	dentry = ERR_PTR(-ENOMEM);
+	arg.name = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!arg.name))
+		goto out_file;
+	arg.ino = ino;
+	arg.found = 0;
+	do {
+		arg.called = 0;
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(file, &arg.ctx);
+	} while (!err && !arg.found && arg.called);
+	dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_name;
+	/* instead of ENOENT */
+	dentry = ERR_PTR(-ESTALE);
+	if (!arg.found)
+		goto out_name;
+
+	/* do not call vfsub_lkup_one() */
+	dir = d_inode(parent);
+	dentry = vfsub_lookup_one_len_unlocked(arg.name, parent, arg.namelen);
+	AuTraceErrPtr(dentry);
+	if (IS_ERR(dentry))
+		goto out_name;
+	AuDebugOn(au_test_anon(dentry));
+	if (unlikely(d_really_is_negative(dentry))) {
+		dput(dentry);
+		dentry = ERR_PTR(-ENOENT);
+	}
+
+out_name:
+	free_page((unsigned long)arg.name);
+out_file:
+	fput(file);
+out:
+	if (unlikely(nsi_lock
+		     && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0))
+		if (!IS_ERR(dentry)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ESTALE);
+		}
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino,
+					ino_t dir_ino,
+					struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry;
+	struct path path;
+
+	if (dir_ino != AUFS_ROOT_INO) {
+		path.dentry = decode_by_ino(sb, dir_ino, 0);
+		dentry = path.dentry;
+		if (!path.dentry || IS_ERR(path.dentry))
+			goto out;
+		AuDebugOn(au_test_anon(path.dentry));
+	} else
+		path.dentry = dget(sb->s_root);
+
+	path.mnt = au_mnt_get(sb);
+	dentry = au_lkup_by_ino(&path, ino, nsi_lock);
+	path_put(&path);
+
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int h_acceptable(void *expv, struct dentry *dentry)
+{
+	return 1;
+}
+
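+/*
+ * build the path of @h_parent as seen through the aufs mount: the path of
+ * the aufs root followed by @h_parent's path relative to the branch root
+ * @h_rootpath.
+ */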
+static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath,
+			   char *buf, int len, struct super_block *sb)
+{
+	char *p;
+	int n;
+	struct path path;
+
+	p = d_path(h_rootpath, buf, len);
+	if (IS_ERR(p))
+		goto out;
+	n = strlen(p);
+
+	path.mnt = h_rootpath->mnt;
+	path.dentry = h_parent;
+	p = d_path(&path, buf, len);
+	if (IS_ERR(p))
+		goto out;
+	if (n != 1)
+		p += n;
+
+	path.mnt = au_mnt_get(sb);
+	path.dentry = sb->s_root;
+	p = d_path(&path, buf, len - strlen(p));
+	mntput(path.mnt);
+	if (IS_ERR(p))
+		goto out;
+	if (n != 1)
+		p[strlen(p)] = '/';
+
+out:
+	AuTraceErrPtr(p);
+	return p;
+}
+
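+/*
+ * decode by path: decode the parent on the branch via the lower fs's export
+ * operations, convert it to a path in the aufs namespace, and then look the
+ * target up there by its inode number.
+ */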
+static
+struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh,
+			      int fh_len, struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry, *h_parent, *root;
+	struct super_block *h_sb;
+	char *pathname, *p;
+	struct vfsmount *h_mnt;
+	struct au_branch *br;
+	int err;
+	struct path path;
+
+	br = au_sbr(sb, nsi_lock->bindex);
+	h_mnt = au_br_mnt(br);
+	h_sb = h_mnt->mnt_sb;
+	/* todo: call lower fh_to_dentry()? fh_to_parent()? */
+	h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail),
+				      fh_len - Fh_tail, fh[Fh_h_type],
+				      h_acceptable, /*context*/NULL);
+	dentry = h_parent;
+	if (unlikely(!h_parent || IS_ERR(h_parent))) {
+		AuWarn1("%s decode_fh failed, %ld\n",
+			au_sbtype(h_sb), PTR_ERR(h_parent));
+		goto out;
+	}
+	dentry = NULL;
+	if (unlikely(au_test_anon(h_parent))) {
+		AuWarn1("%s decode_fh returned a disconnected dentry\n",
+			au_sbtype(h_sb));
+		goto out_h_parent;
+	}
+
+	dentry = ERR_PTR(-ENOMEM);
+	pathname = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!pathname))
+		goto out_h_parent;
+
+	root = sb->s_root;
+	path.mnt = h_mnt;
+	di_read_lock_parent(root, !AuLock_IR);
+	path.dentry = au_h_dptr(root, nsi_lock->bindex);
+	di_read_unlock(root, !AuLock_IR);
+	p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb);
+	dentry = (void *)p;
+	if (IS_ERR(p))
+		goto out_pathname;
+
+	si_read_unlock(sb);
+	err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+	dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_relock;
+
+	dentry = ERR_PTR(-ENOENT);
+	AuDebugOn(au_test_anon(path.dentry));
+	if (unlikely(d_really_is_negative(path.dentry)))
+		goto out_path;
+
+	if (ino != d_inode(path.dentry)->i_ino)
+		dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL);
+	else
+		dentry = dget(path.dentry);
+
+out_path:
+	path_put(&path);
+out_relock:
+	if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0))
+		if (!IS_ERR(dentry)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ESTALE);
+		}
+out_pathname:
+	free_page((unsigned long)pathname);
+out_h_parent:
+	dput(h_parent);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
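+/*
+ * decode in three stages: try the still-cached inode first, then the cached
+ * parent directory, and finally rebuild the path and look it up.
+ */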
+static struct dentry *
+aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
+		  int fh_type)
+{
+	struct dentry *dentry;
+	__u32 *fh = fid->raw;
+	struct au_branch *br;
+	ino_t ino, dir_ino;
+	struct au_nfsd_si_lock nsi_lock = {
+		.force_lock	= 0
+	};
+
+	dentry = ERR_PTR(-ESTALE);
+	/* it should never happen, but the file handle is unreliable */
+	if (unlikely(fh_len < Fh_tail))
+		goto out;
+	nsi_lock.sigen = fh[Fh_sigen];
+	nsi_lock.br_id = fh[Fh_br_id];
+
+	/* branch id may be wrapped around */
+	br = NULL;
+	if (unlikely(si_nfsd_read_lock(sb, &nsi_lock)))
+		goto out;
+	nsi_lock.force_lock = 1;
+
+	/* is this inode still cached? */
+	ino = decode_ino(fh + Fh_ino);
+	/* it should never happen */
+	if (unlikely(ino == AUFS_ROOT_INO))
+		goto out_unlock;
+
+	dir_ino = decode_ino(fh + Fh_dir_ino);
+	dentry = decode_by_ino(sb, ino, dir_ino);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (dentry)
+		goto accept;
+
+	/* is the parent dir cached? */
+	br = au_sbr(sb, nsi_lock.bindex);
+	atomic_inc(&br->br_count);
+	dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (dentry)
+		goto accept;
+
+	/* lookup path */
+	dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (unlikely(!dentry))
+		/* todo?: make it ESTALE */
+		goto out_unlock;
+
+accept:
+	if (!au_digen_test(dentry, au_sigen(sb))
+	    && d_inode(dentry)->i_generation == fh[Fh_igen])
+		goto out_unlock; /* success */
+
+	dput(dentry);
+	dentry = ERR_PTR(-ESTALE);
+out_unlock:
+	if (br)
+		atomic_dec(&br->br_count);
+	si_read_unlock(sb);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+#if 0 /* reserved for future use */
+/* support subtreecheck option */
+static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct dentry *parent;
+	__u32 *fh = fid->raw;
+	ino_t dir_ino;
+
+	dir_ino = decode_ino(fh + Fh_dir_ino);
+	parent = decode_by_ino(sb, dir_ino, 0);
+	if (IS_ERR(parent))
+		goto out;
+	if (!parent)
+		parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]),
+					dir_ino, fh, fh_len);
+
+out:
+	AuTraceErrPtr(parent);
+	return parent;
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+			  struct inode *dir)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb, *h_sb;
+	struct dentry *dentry, *parent, *h_parent;
+	struct inode *h_dir;
+	struct au_branch *br;
+
+	err = -ENOSPC;
+	if (unlikely(*max_len <= Fh_tail)) {
+		AuWarn1("NFSv2 client (max_len %d)?\n", *max_len);
+		goto out;
+	}
+
+	err = FILEID_ROOT;
+	if (inode->i_ino == AUFS_ROOT_INO) {
+		AuDebugOn(inode->i_ino != AUFS_ROOT_INO);
+		goto out;
+	}
+
+	h_parent = NULL;
+	sb = inode->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH);
+	if (unlikely(err))
+		goto out;
+
+#ifdef CONFIG_AUFS_DEBUG
+	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
+		AuWarn1("NFS-exporting requires xino\n");
+#endif
+	err = -EIO;
+	parent = NULL;
+	ii_read_lock_child(inode);
+	bindex = au_ibstart(inode);
+	if (!dir) {
+		dentry = d_find_any_alias(inode);
+		if (unlikely(!dentry))
+			goto out_unlock;
+		AuDebugOn(au_test_anon(dentry));
+		parent = dget_parent(dentry);
+		dput(dentry);
+		if (unlikely(!parent))
+			goto out_unlock;
+		if (d_really_is_positive(parent))
+			dir = d_inode(parent);
+	}
+
+	ii_read_lock_parent(dir);
+	h_dir = au_h_iptr(dir, bindex);
+	ii_read_unlock(dir);
+	if (unlikely(!h_dir))
+		goto out_parent;
+	h_parent = d_find_any_alias(h_dir);
+	if (unlikely(!h_parent))
+		goto out_hparent;
+
+	err = -EPERM;
+	br = au_sbr(sb, bindex);
+	h_sb = au_br_sb(br);
+	if (unlikely(!h_sb->s_export_op)) {
+		AuErr1("%s branch is not exportable\n", au_sbtype(h_sb));
+		goto out_hparent;
+	}
+
+	fh[Fh_br_id] = br->br_id;
+	fh[Fh_sigen] = au_sigen(sb);
+	encode_ino(fh + Fh_ino, inode->i_ino);
+	encode_ino(fh + Fh_dir_ino, dir->i_ino);
+	fh[Fh_igen] = inode->i_generation;
+
+	*max_len -= Fh_tail;
+	fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail),
+					   max_len,
+					   /*connectable or subtreecheck*/0);
+	err = fh[Fh_h_type];
+	*max_len += Fh_tail;
+	/* todo: macros? */
+	if (err != FILEID_INVALID)
+		err = 99;
+	else
+		AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb));
+
+out_hparent:
+	dput(h_parent);
+out_parent:
+	dput(parent);
+out_unlock:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+out:
+	if (unlikely(err < 0))
+		err = FILEID_INVALID;
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_commit_metadata(struct inode *inode)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb;
+	struct inode *h_inode;
+	int (*f)(struct inode *inode);
+
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+	ii_write_lock_child(inode);
+	bindex = au_ibstart(inode);
+	AuDebugOn(bindex < 0);
+	h_inode = au_h_iptr(inode, bindex);
+
+	f = h_inode->i_sb->s_export_op->commit_metadata;
+	if (f)
+		err = f(h_inode);
+	else {
+		struct writeback_control wbc = {
+			.sync_mode	= WB_SYNC_ALL,
+			.nr_to_write	= 0 /* metadata only */
+		};
+
+		err = sync_inode(h_inode, &wbc);
+	}
+
+	au_cpup_attr_timesizes(inode);
+	ii_write_unlock(inode);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct export_operations aufs_export_op = {
+	.fh_to_dentry		= aufs_fh_to_dentry,
+	/* .fh_to_parent	= aufs_fh_to_parent, */
+	.encode_fh		= aufs_encode_fh,
+	.commit_metadata	= aufs_commit_metadata
+};
+
+void au_export_init(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	__u32 u;
+
+	sb->s_export_op = &aufs_export_op;
+	sbinfo = au_sbi(sb);
+	sbinfo->si_xigen = NULL;
+	get_random_bytes(&u, sizeof(u));
+	BUILD_BUG_ON(sizeof(u) != sizeof(int));
+	atomic_set(&sbinfo->si_xigen_next, u);
+}
diff --git b/fs/aufs/f_op.c b/fs/aufs/f_op.c
new file mode 100644
index 0000000..ac1304a
--- /dev/null
+++ b/fs/aufs/f_op.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file and vm operations
+ */
+
+#include <linux/aio.h>
+#include <linux/fs_stack.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include "aufs.h"
+
+int au_do_open_nondir(struct file *file, int flags, struct file *h_file)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct dentry *dentry, *h_dentry;
+	struct au_finfo *finfo;
+	struct inode *h_inode;
+
+	FiMustWriteLock(file);
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	AuDebugOn(IS_ERR_OR_NULL(dentry));
+	finfo = au_fi(file);
+	memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
+	atomic_set(&finfo->fi_mmapped, 0);
+	bindex = au_dbstart(dentry);
+	if (!h_file) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			goto out;
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+	} else {
+		h_dentry = h_file->f_path.dentry;
+		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			goto out;
+		get_file(h_file);
+	}
+	if (IS_ERR(h_file))
+		err = PTR_ERR(h_file);
+	else {
+		if ((flags & __O_TMPFILE)
+		    && !(flags & O_EXCL)) {
+			h_inode = file_inode(h_file);
+			spin_lock(&h_inode->i_lock);
+			h_inode->i_state |= I_LINKABLE;
+			spin_unlock(&h_inode->i_lock);
+		}
+		au_set_fbstart(file, bindex);
+		au_set_h_fptr(file, bindex, h_file);
+		au_update_figen(file);
+		/* todo: necessary? */
+		/* file->f_ra = h_file->f_ra; */
+	}
+
+out:
+	return err;
+}
+
+static int aufs_open_nondir(struct inode *inode __maybe_unused,
+			    struct file *file)
+{
+	int err;
+	struct super_block *sb;
+	struct au_do_open_args args = {
+		.open	= au_do_open_nondir
+	};
+
+	AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
+	      file, vfsub_file_flags(file), file->f_mode);
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	err = au_do_open(file, &args);
+	si_read_unlock(sb);
+	return err;
+}
+
+int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
+{
+	struct au_finfo *finfo;
+	aufs_bindex_t bindex;
+
+	finfo = au_fi(file);
+	au_sphl_del(&finfo->fi_hlist,
+		    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+	bindex = finfo->fi_btop;
+	if (bindex >= 0)
+		au_set_h_fptr(file, bindex, NULL);
+
+	au_finfo_fin(file);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_flush_nondir(struct file *file, fl_owner_t id)
+{
+	int err;
+	struct file *h_file;
+
+	err = 0;
+	h_file = au_hf_top(file);
+	if (h_file)
+		err = vfsub_flush(h_file, id);
+	return err;
+}
+
+static int aufs_flush_nondir(struct file *file, fl_owner_t id)
+{
+	return au_do_flush(file, id, au_do_flush_nondir);
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * The read and write functions acquire [fdi]_rwsem once, but release them
+ * before mmap_sem, in order to avoid a race condition with mmap(2).
+ * Releasing these aufs rwsems should be safe: as long as si_rwsem is kept, no
+ * branch-management and thus no harmful copy-up can happen. Actually a
+ * copy-up may happen in the read functions after [fdi]_rwsem are released,
+ * but it should be harmless.
+ */
+
+/* Callers should call au_read_post() or fput() in the end */
+struct file *au_read_pre(struct file *file, int keep_fi)
+{
+	struct file *h_file;
+	int err;
+
+	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
+	if (!err) {
+		di_read_unlock(file->f_path.dentry, AuLock_IR);
+		h_file = au_hf_top(file);
+		get_file(h_file);
+		if (!keep_fi)
+			fi_read_unlock(file);
+	} else
+		h_file = ERR_PTR(err);
+
+	return h_file;
+}
+
+static void au_read_post(struct inode *inode, struct file *h_file)
+{
+	/* update without a lock; I do not think it is a problem */
+	fsstack_copy_attr_atime(inode, file_inode(h_file));
+	fput(h_file);
+}
+
+struct au_write_pre {
+	blkcnt_t blks;
+	aufs_bindex_t bstart;
+};
+
+/*
+ * returns with iinfo write-locked.
+ * callers should call au_write_post(), or iinfo_write_unlock() + fput(), in
+ * the end.
+ */
+static struct file *au_write_pre(struct file *file, int do_ready,
+				 struct au_write_pre *wpre)
+{
+	struct file *h_file;
+	struct dentry *dentry;
+	int err;
+	struct au_pin pin;
+
+	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
+	h_file = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	if (do_ready) {
+		err = au_ready_to_write(file, -1, &pin);
+		if (unlikely(err)) {
+			h_file = ERR_PTR(err);
+			di_write_unlock(dentry);
+			goto out_fi;
+		}
+	}
+
+	di_downgrade_lock(dentry, /*flags*/0);
+	if (wpre)
+		wpre->bstart = au_fbstart(file);
+	h_file = au_hf_top(file);
+	get_file(h_file);
+	if (wpre)
+		wpre->blks = file_inode(h_file)->i_blocks;
+	if (do_ready)
+		au_unpin(&pin);
+	di_read_unlock(dentry, /*flags*/0);
+
+out_fi:
+	fi_write_unlock(file);
+out:
+	return h_file;
+}
+
+static void au_write_post(struct inode *inode, struct file *h_file,
+			  struct au_write_pre *wpre, ssize_t written)
+{
+	struct inode *h_inode;
+
+	au_cpup_attr_timesizes(inode);
+	AuDebugOn(au_ibstart(inode) != wpre->bstart);
+	h_inode = file_inode(h_file);
+	inode->i_mode = h_inode->i_mode;
+	ii_write_unlock(inode);
+	fput(h_file);
+
+	/* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
+	if (written > 0)
+		au_fhsm_wrote(inode->i_sb, wpre->bstart,
+			      /*force*/h_inode->i_blocks > wpre->blks);
+}
+
+static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
+			 loff_t *ppos)
+{
+	ssize_t err;
+	struct inode *inode;
+	struct file *h_file;
+	struct super_block *sb;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	/* the file data may be obsoleted by a concurrent copy-up, but no problem */
+	err = vfsub_read_u(h_file, buf, count, ppos);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/*
+ * todo: very ugly
+ * it safely acquires both i_mutex and si_rwsem for read.
+ * if the plink maintenance mode continues forever (which is the real
+ * problem), this may loop forever.
+ */
+static void au_mtx_and_read_lock(struct inode *inode)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+
+	while (1) {
+		inode_lock(inode);
+		err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+		if (!err)
+			break;
+		inode_unlock(inode);
+		si_read_lock(sb, AuLock_NOPLMW);
+		si_read_unlock(sb);
+	}
+}
+
+static ssize_t aufs_write(struct file *file, const char __user *ubuf,
+			  size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+	char __user *buf = (char __user *)ubuf;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = vfsub_write_u(h_file, buf, count, ppos);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio,
+			  struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct file *file;
+	ssize_t (*iter)(struct kiocb *, struct iov_iter *);
+
+	err = security_file_permission(h_file, rw);
+	if (unlikely(err))
+		goto out;
+
+	err = -ENOSYS;
+	iter = NULL;
+	if (rw == MAY_READ)
+		iter = h_file->f_op->read_iter;
+	else if (rw == MAY_WRITE)
+		iter = h_file->f_op->write_iter;
+
+	file = kio->ki_filp;
+	kio->ki_filp = h_file;
+	if (iter) {
+		lockdep_off();
+		err = iter(kio, iov_iter);
+		lockdep_on();
+	} else
+		/* currently there is no such fs */
+		WARN_ON_ONCE(1);
+	kio->ki_filp = file;
+
+out:
+	return err;
+}
+
+static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct file *file, *h_file;
+	struct inode *inode;
+	struct super_block *sb;
+
+	file = kio->ki_filp;
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = au_do_iter(h_file, MAY_READ, kio, iov_iter);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *file, *h_file;
+
+	file = kio->ki_filp;
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags)
+{
+	ssize_t err;
+	struct file *h_file;
+	struct inode *inode;
+	struct super_block *sb;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/1);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	if (au_test_loopback_kthread()) {
+		au_warn_loopback(h_file->f_path.dentry->d_sb);
+		if (file->f_mapping != h_file->f_mapping) {
+			file->f_mapping = h_file->f_mapping;
+			smp_mb(); /* unnecessary? */
+		}
+	}
+	fi_read_unlock(file);
+
+	err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static ssize_t
+aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
+		  size_t len, unsigned int flags)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+static long aufs_fallocate(struct file *file, int mode, loff_t offset,
+			   loff_t len)
+{
+	long err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	lockdep_off();
+	err = vfs_fallocate(h_file, mode, offset, len);
+	lockdep_on();
+	au_write_post(inode, h_file, &wpre, /*written*/1);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * The locking order around current->mmap_sem.
+ * - in most and regular cases
+ *   file I/O syscall -- aufs_read() or something
+ *	-- si_rwsem for read -- mmap_sem
+ *	(Note that [fdi]i_rwsem are released before mmap_sem).
+ * - in mmap case
+ *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
+ * This AB-BA order is definitely bad, but it is not a problem since "si_rwsem
+ * for read" allows multiple processes to acquire it and [fdi]i_rwsem are not
+ * held in file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
+ * It means that when aufs acquires si_rwsem for write, the process must never
+ * acquire mmap_sem.
+ *
+ * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not
+ * a problem either since directories cannot be mmap-ed.
+ * The same applies to aufs_readlink().
+ */
+
+#if 0 /* stop calling security_file_mmap() */
+/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
+#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)
+
+static unsigned long au_arch_prot_conv(unsigned long flags)
+{
+	/* currently ppc64 only */
+#ifdef CONFIG_PPC64
+	/* cf. linux/arch/powerpc/include/asm/mman.h */
+	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
+	return AuConv_VM_PROT(flags, SAO);
+#else
+	AuDebugOn(arch_calc_vm_prot_bits(-1));
+	return 0;
+#endif
+}
+
+static unsigned long au_prot_conv(unsigned long flags)
+{
+	return AuConv_VM_PROT(flags, READ)
+		| AuConv_VM_PROT(flags, WRITE)
+		| AuConv_VM_PROT(flags, EXEC)
+		| au_arch_prot_conv(flags);
+}
+
+/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
+#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)
+
+static unsigned long au_flag_conv(unsigned long flags)
+{
+	return AuConv_VM_MAP(flags, GROWSDOWN)
+		| AuConv_VM_MAP(flags, DENYWRITE)
+		| AuConv_VM_MAP(flags, LOCKED);
+}
+#endif
+
+static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err;
+	const unsigned char wlock
+		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
+	struct super_block *sb;
+	struct file *h_file;
+	struct inode *inode;
+
+	AuDbgVmRegion(file, vma);
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	lockdep_off();
+	si_read_lock(sb, AuLock_NOPLMW);
+
+	h_file = au_write_pre(file, wlock, /*wpre*/NULL);
+	lockdep_on();
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = 0;
+	au_set_mmapped(file);
+	au_vm_file_reset(vma, h_file);
+	/*
+	 * we cannot call security_mmap_file() here since it may acquire
+	 * mmap_sem or i_mutex.
+	 *
+	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
+	 *			 au_flag_conv(vma->vm_flags));
+	 */
+	if (!err)
+		err = h_file->f_op->mmap(h_file, vma);
+	if (!err) {
+		au_vm_prfile_set(vma, file);
+		fsstack_copy_attr_atime(inode, file_inode(h_file));
+		goto out_fput; /* success */
+	}
+	au_unset_mmapped(file);
+	au_vm_file_reset(vma, file);
+
+out_fput:
+	lockdep_off();
+	ii_write_unlock(inode);
+	lockdep_on();
+	fput(h_file);
+out:
+	lockdep_off();
+	si_read_unlock(sb);
+	lockdep_on();
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end,
+			     int datasync)
+{
+	int err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	err = 0; /* -EBADF; */ /* posix? */
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_unlock;
+
+	err = vfsub_fsync(h_file, &h_file->f_path, datasync);
+	au_write_post(inode, h_file, &wpre, /*written*/0);
+
+out_unlock:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+out:
+	return err;
+}
+
+/* no one supports this operation, currently */
+#if 0
+static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync)
+{
+	int err;
+	struct au_write_pre wpre;
+	struct inode *inode, *h_inode;
+	struct file *file, *h_file;
+
+	file = kio->ki_filp;
+	err = 0; /* -EBADF; */ /* posix? */
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_unlock;
+
+	err = -ENOSYS;
+	h_file = au_hf_top(file);
+	if (h_file->f_op->aio_fsync) {
+		h_inode = file_inode(h_file);
+		if (!is_sync_kiocb(kio)) {
+			get_file(h_file);
+			fput(file);
+		}
+		kio->ki_filp = h_file;
+		err = h_file->f_op->aio_fsync(kio, datasync);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		if (!err)
+			vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
+		/*ignore*/
+		inode_unlock(h_inode);
+	}
+	au_write_post(inode, h_file, &wpre, /*written*/0);
+
+out_unlock:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+out:
+	return err;
+}
+#endif
+
+static int aufs_fasync(int fd, struct file *file, int flag)
+{
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	if (h_file->f_op->fasync)
+		err = h_file->f_op->fasync(fd, h_file, flag);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static int aufs_setfl(struct file *file, unsigned long arg)
+{
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */
+	err = setfl(/*unused fd*/-1, h_file, arg);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* no one supports this operation, currently */
+#if 0
+static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
+			     size_t len, loff_t *pos, int more)
+{
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+const struct file_operations aufs_file_fop = {
+	.owner		= THIS_MODULE,
+
+	.llseek		= default_llseek,
+
+	.read		= aufs_read,
+	.write		= aufs_write,
+	.read_iter	= aufs_read_iter,
+	.write_iter	= aufs_write_iter,
+
+#ifdef CONFIG_AUFS_POLL
+	.poll		= aufs_poll,
+#endif
+	.unlocked_ioctl	= aufs_ioctl_nondir,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= aufs_compat_ioctl_nondir,
+#endif
+	.mmap		= aufs_mmap,
+	.open		= aufs_open_nondir,
+	.flush		= aufs_flush_nondir,
+	.release	= aufs_release_nondir,
+	.fsync		= aufs_fsync_nondir,
+	/* .aio_fsync	= aufs_aio_fsync_nondir, */
+	.fasync		= aufs_fasync,
+	/* .sendpage	= aufs_sendpage, */
+	.setfl		= aufs_setfl,
+	.splice_write	= aufs_splice_write,
+	.splice_read	= aufs_splice_read,
+#if 0
+	.aio_splice_write = aufs_aio_splice_write,
+	.aio_splice_read  = aufs_aio_splice_read,
+#endif
+	.fallocate	= aufs_fallocate
+};
diff --git b/fs/aufs/fhsm.c b/fs/aufs/fhsm.c
new file mode 100644
index 0000000..db079d6
--- /dev/null
+++ b/fs/aufs/fhsm.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2011-2016 Junjiro R. Okajima
+ */
+
+/*
+ * File-based Hierarchy Storage Management
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/poll.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+static aufs_bindex_t au_fhsm_bottom(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	AuDebugOn(!fhsm);
+	return fhsm->fhsm_bottom;
+}
+
+void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	AuDebugOn(!fhsm);
+	fhsm->fhsm_bottom = bindex;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br)
+{
+	struct au_br_fhsm *bf;
+
+	bf = br->br_fhsm;
+	MtxMustLock(&bf->bf_lock);
+
+	return !bf->bf_readable
+		|| time_after(jiffies,
+			      bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_fhsm_notify(struct super_block *sb, int val)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	if (au_fhsm_pid(fhsm)
+	    && atomic_read(&fhsm->fhsm_readable) != -1) {
+		atomic_set(&fhsm->fhsm_readable, val);
+		if (val)
+			wake_up(&fhsm->fhsm_wqh);
+	}
+}
+
+static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex,
+			struct aufs_stfs *rstfs, int do_lock, int do_notify)
+{
+	int err;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	br = au_sbr(sb, bindex);
+	AuDebugOn(au_br_rdonly(br));
+	bf = br->br_fhsm;
+	AuDebugOn(!bf);
+
+	if (do_lock)
+		mutex_lock(&bf->bf_lock);
+	else
+		MtxMustLock(&bf->bf_lock);
+
+	/* sb->s_root for NFS is unreliable */
+	err = au_br_stfs(br, &bf->bf_stfs);
+	if (unlikely(err)) {
+		AuErr1("FHSM failed (%d), b%d, ignored.\n", err, bindex);
+		goto out;
+	}
+
+	bf->bf_jiffy = jiffies;
+	bf->bf_readable = 1;
+	if (do_notify)
+		au_fhsm_notify(sb, /*val*/1);
+	if (rstfs)
+		*rstfs = bf->bf_stfs;
+
+out:
+	if (do_lock)
+		mutex_unlock(&bf->bf_lock);
+	au_fhsm_notify(sb, /*val*/1);
+
+	return err;
+}
+
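+/*
+ * refresh the cached statfs of the branch and wake up the FHSM reader when
+ * data has been written to a non-bottom fhsm branch.
+ */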
+void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	AuDbg("b%d, force %d\n", bindex, force);
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	if (!au_ftest_si(sbinfo, FHSM)
+	    || fhsm->fhsm_bottom == bindex)
+		return;
+
+	br = au_sbr(sb, bindex);
+	bf = br->br_fhsm;
+	AuDebugOn(!bf);
+	mutex_lock(&bf->bf_lock);
+	if (force
+	    || au_fhsm_pid(fhsm)
+	    || au_fhsm_test_jiffy(sbinfo, br))
+		err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0,
+				  /*do_notify*/1);
+	mutex_unlock(&bf->bf_lock);
+}
+
+void au_fhsm_wrote_all(struct super_block *sb, int force)
+{
+	aufs_bindex_t bindex, bend;
+	struct au_branch *br;
+
+	/* exclude the bottom */
+	bend = au_fhsm_bottom(sb);
+	for (bindex = 0; bindex < bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (au_br_fhsm(br->br_perm))
+			au_fhsm_wrote(sb, bindex, force);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+static unsigned int au_fhsm_poll(struct file *file,
+				 struct poll_table_struct *wait)
+{
+	unsigned int mask;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	mask = 0;
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+	poll_wait(file, &fhsm->fhsm_wqh, wait);
+	if (atomic_read(&fhsm->fhsm_readable))
+		mask = POLLIN /* | POLLRDNORM */;
+
+	AuTraceErr((int)mask);
+	return mask;
+}
+
+static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr,
+			      struct aufs_stfs *stfs, __s16 brid)
+{
+	int err;
+
+	err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs));
+	if (!err)
+		err = __put_user(brid, &stbr->brid);
+	if (unlikely(err))
+		err = -EFAULT;
+
+	return err;
+}
+
+static ssize_t au_fhsm_do_read(struct super_block *sb,
+			       struct aufs_stbr __user *stbr, size_t count)
+{
+	ssize_t err;
+	int nstbr;
+	aufs_bindex_t bindex, bend;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	/* except the bottom branch */
+	err = 0;
+	nstbr = 0;
+	bend = au_fhsm_bottom(sb);
+	for (bindex = 0; !err && bindex < bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!au_br_fhsm(br->br_perm))
+			continue;
+
+		bf = br->br_fhsm;
+		mutex_lock(&bf->bf_lock);
+		if (bf->bf_readable) {
+			err = -EFAULT;
+			if (count >= sizeof(*stbr))
+				err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs,
+							  br->br_id);
+			if (!err) {
+				bf->bf_readable = 0;
+				count -= sizeof(*stbr);
+				nstbr++;
+			}
+		}
+		mutex_unlock(&bf->bf_lock);
+	}
+	if (!err)
+		err = sizeof(*stbr) * nstbr;
+
+	return err;
+}
+
+static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *pos)
+{
+	ssize_t err;
+	int readable;
+	aufs_bindex_t nfhsm, bindex, bend;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	struct au_branch *br;
+	struct super_block *sb;
+
+	err = 0;
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+need_data:
+	spin_lock_irq(&fhsm->fhsm_wqh.lock);
+	if (!atomic_read(&fhsm->fhsm_readable)) {
+		if (vfsub_file_flags(file) & O_NONBLOCK)
+			err = -EAGAIN;
+		else
+			err = wait_event_interruptible_locked_irq
+				(fhsm->fhsm_wqh,
+				 atomic_read(&fhsm->fhsm_readable));
+	}
+	spin_unlock_irq(&fhsm->fhsm_wqh.lock);
+	if (unlikely(err))
+		goto out;
+
+	/* sb may already be dead */
+	au_rw_read_lock(&sbinfo->si_rwsem);
+	readable = atomic_read(&fhsm->fhsm_readable);
+	if (readable > 0) {
+		sb = sbinfo->si_sb;
+		AuDebugOn(!sb);
+		/* exclude the bottom branch */
+		nfhsm = 0;
+		bend = au_fhsm_bottom(sb);
+		for (bindex = 0; bindex < bend; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm))
+				nfhsm++;
+		}
+		err = -EMSGSIZE;
+		if (nfhsm * sizeof(struct aufs_stbr) <= count) {
+			atomic_set(&fhsm->fhsm_readable, 0);
+			err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf,
+					     count);
+		}
+	}
+	au_rw_read_unlock(&sbinfo->si_rwsem);
+	if (!readable)
+		goto need_data;
+
+out:
+	return err;
+}
+
+static int au_fhsm_release(struct inode *inode, struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	/* sb may already be dead */
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock(&fhsm->fhsm_spin);
+	fhsm->fhsm_pid = 0;
+	spin_unlock(&fhsm->fhsm_spin);
+	kobject_put(&sbinfo->si_kobj);
+
+	return 0;
+}
+
+static const struct file_operations au_fhsm_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+	.read		= au_fhsm_read,
+	.poll		= au_fhsm_poll,
+	.release	= au_fhsm_release
+};
+
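+/*
+ * return an anonymous, read-only file descriptor for receiving FHSM
+ * notifications. only a single reader, recorded in fhsm_pid, is allowed at
+ * a time.
+ */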
+int au_fhsm_fd(struct super_block *sb, int oflags)
+{
+	int err, fd;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK)))
+		goto out;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock(&fhsm->fhsm_spin);
+	if (!fhsm->fhsm_pid)
+		fhsm->fhsm_pid = current->pid;
+	else
+		err = -EBUSY;
+	spin_unlock(&fhsm->fhsm_spin);
+	if (unlikely(err))
+		goto out;
+
+	oflags |= O_RDONLY;
+	/* oflags |= FMODE_NONOTIFY; */
+	fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags);
+	err = fd;
+	if (unlikely(fd < 0))
+		goto out_pid;
+
+	/* succeed regardless of the 'fhsm' status */
+	kobject_get(&sbinfo->si_kobj);
+	si_noflush_read_lock(sb);
+	if (au_ftest_si(sbinfo, FHSM))
+		au_fhsm_wrote_all(sb, /*force*/0);
+	si_read_unlock(sb);
+	goto out; /* success */
+
+out_pid:
+	spin_lock(&fhsm->fhsm_spin);
+	fhsm->fhsm_pid = 0;
+	spin_unlock(&fhsm->fhsm_spin);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_fhsm_br_alloc(struct au_branch *br)
+{
+	int err;
+
+	err = 0;
+	br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS);
+	if (br->br_fhsm)
+		au_br_fhsm_init(br->br_fhsm);
+	else
+		err = -ENOMEM;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_fhsm_fin(struct super_block *sb)
+{
+	au_fhsm_notify(sb, /*val*/-1);
+}
+
+void au_fhsm_init(struct au_sbinfo *sbinfo)
+{
+	struct au_fhsm *fhsm;
+
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock_init(&fhsm->fhsm_spin);
+	init_waitqueue_head(&fhsm->fhsm_wqh);
+	atomic_set(&fhsm->fhsm_readable, 0);
+	fhsm->fhsm_expire
+		= msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC);
+	fhsm->fhsm_bottom = -1;
+}
+
+void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec)
+{
+	sbinfo->si_fhsm.fhsm_expire
+		= msecs_to_jiffies(sec * MSEC_PER_SEC);
+}
+
+void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo)
+{
+	unsigned int u;
+
+	if (!au_ftest_si(sbinfo, FHSM))
+		return;
+
+	u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC;
+	if (u != AUFS_FHSM_CACHE_DEF_SEC)
+		seq_printf(seq, ",fhsm_sec=%u", u);
+}
diff --git b/fs/aufs/file.c b/fs/aufs/file.c
new file mode 100644
index 0000000..6b8a66b
--- /dev/null
+++ b/fs/aufs/file.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * handling file/dir, and address_space operations
+ */
+
+#ifdef CONFIG_AUFS_DEBUG
+#include <linux/migrate.h>
+#endif
+#include <linux/pagemap.h>
+#include "aufs.h"
+
+/* drop flags for writing */
+unsigned int au_file_roflags(unsigned int flags)
+{
+	flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC);
+	flags |= O_RDONLY | O_NOATIME;
+	return flags;
+}
+
+/* common functions to regular file and dir */
+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
+		       struct file *file, int force_wr)
+{
+	struct file *h_file;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct path h_path;
+	int err;
+
+	/* a race condition can happen between open and unlink/rmdir */
+	h_file = ERR_PTR(-ENOENT);
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry)))
+		goto out;
+	h_inode = d_inode(h_dentry);
+	spin_lock(&h_dentry->d_lock);
+	err = (!d_unhashed(dentry) && d_unlinked(h_dentry))
+		/* || !d_inode(dentry)->i_nlink */
+		;
+	spin_unlock(&h_dentry->d_lock);
+	if (unlikely(err))
+		goto out;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, bindex);
+	err = au_br_test_oflag(flags, br);
+	h_file = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	/* drop flags for writing */
+	if (au_test_ro(sb, bindex, d_inode(dentry))) {
+		if (force_wr && !(flags & O_WRONLY))
+			force_wr = 0;
+		flags = au_file_roflags(flags);
+		if (force_wr) {
+			h_file = ERR_PTR(-EROFS);
+			flags = au_file_roflags(flags);
+			if (unlikely(vfsub_native_ro(h_inode)
+				     || IS_APPEND(h_inode)))
+				goto out;
+			flags &= ~O_ACCMODE;
+			flags |= O_WRONLY;
+		}
+	}
+	flags &= ~O_CREAT;
+	atomic_inc(&br->br_count);
+	h_path.dentry = h_dentry;
+	h_path.mnt = au_br_mnt(br);
+	h_file = vfsub_dentry_open(&h_path, flags);
+	if (IS_ERR(h_file))
+		goto out_br;
+
+	if (flags & __FMODE_EXEC) {
+		err = deny_write_access(h_file);
+		if (unlikely(err)) {
+			fput(h_file);
+			h_file = ERR_PTR(err);
+			goto out_br;
+		}
+	}
+	fsnotify_open(h_file);
+	goto out; /* success */
+
+out_br:
+	atomic_dec(&br->br_count);
+out:
+	return h_file;
+}
+
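+/*
+ * handle the branch's copy-up/move-on-open (coo/moo) attributes: copy the
+ * file up to an upper writable branch, and for 'moo' additionally unlink
+ * the original on the lower branch.
+ */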
+static int au_cmoo(struct dentry *dentry)
+{
+	int err, cmoo;
+	unsigned int udba;
+	struct path h_path;
+	struct au_pin pin;
+	struct au_cp_generic cpg = {
+		.dentry	= dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= -1,
+		.pin	= &pin,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN
+	};
+	struct inode *delegated;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	pid_t pid;
+	struct au_branch *br;
+	struct dentry *parent;
+	struct au_hinode *hdir;
+
+	DiMustWriteLock(dentry);
+	IiMustWriteLock(d_inode(dentry));
+
+	err = 0;
+	if (IS_ROOT(dentry))
+		goto out;
+	cpg.bsrc = au_dbstart(dentry);
+	if (!cpg.bsrc)
+		goto out;
+
+	sb = dentry->d_sb;
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	pid = au_fhsm_pid(fhsm);
+	if (pid
+	    && (current->pid == pid
+		|| current->real_parent->pid == pid))
+		goto out;
+
+	br = au_sbr(sb, cpg.bsrc);
+	cmoo = au_br_cmoo(br->br_perm);
+	if (!cmoo)
+		goto out;
+	if (!d_is_reg(dentry))
+		cmoo &= AuBrAttr_COO_ALL;
+	if (!cmoo)
+		goto out;
+
+	parent = dget_parent(dentry);
+	di_write_lock_parent(parent);
+	err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1);
+	cpg.bdst = err;
+	if (unlikely(err < 0)) {
+		err = 0;	/* there is no upper writable branch */
+		goto out_dgrade;
+	}
+	AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst);
+
+	/* do not respect the coo attrib for the target branch */
+	err = au_cpup_dirs(dentry, cpg.bdst);
+	if (unlikely(err))
+		goto out_dgrade;
+
+	di_downgrade_lock(parent, AuLock_IR);
+	udba = au_opt_udba(sb);
+	err = au_pin(&pin, dentry, cpg.bdst, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_parent;
+
+	err = au_sio_cpup_simple(&cpg);
+	au_unpin(&pin);
+	if (unlikely(err))
+		goto out_parent;
+	if (!(cmoo & AuBrWAttr_MOO))
+		goto out_parent; /* success */
+
+	err = au_pin(&pin, dentry, cpg.bsrc, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_parent;
+
+	h_path.mnt = au_br_mnt(br);
+	h_path.dentry = au_h_dptr(dentry, cpg.bsrc);
+	hdir = au_hi(d_inode(parent), cpg.bsrc);
+	delegated = NULL;
+	err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1);
+	au_unpin(&pin);
+	/* todo: keep h_dentry or not? */
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err)) {
+		pr_err("unlink %pd after coo failed (%d), ignored\n",
+		       dentry, err);
+		err = 0;
+	}
+	goto out_parent; /* success */
+
+out_dgrade:
+	di_downgrade_lock(parent, AuLock_IR);
+out_parent:
+	di_read_unlock(parent, AuLock_IR);
+	dput(parent);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_do_open(struct file *file, struct au_do_open_args *args)
+{
+	int err, no_lock = args->no_lock;
+	struct dentry *dentry;
+	struct au_finfo *finfo;
+
+	if (!no_lock)
+		err = au_finfo_init(file, args->fidir);
+	else {
+		lockdep_off();
+		err = au_finfo_init(file, args->fidir);
+		lockdep_on();
+	}
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	AuDebugOn(IS_ERR_OR_NULL(dentry));
+	if (!no_lock) {
+		di_write_lock_child(dentry);
+		err = au_cmoo(dentry);
+		di_downgrade_lock(dentry, AuLock_IR);
+		if (!err)
+			err = args->open(file, vfsub_file_flags(file), NULL);
+		di_read_unlock(dentry, AuLock_IR);
+	} else {
+		err = au_cmoo(dentry);
+		if (!err)
+			err = args->open(file, vfsub_file_flags(file),
+					 args->h_file);
+		if (!err && au_fbstart(file) != au_dbstart(dentry))
+			/*
+			 * cmoo happens after h_file was opened.
+			 * need to refresh file later.
+			 */
+			atomic_dec(&au_fi(file)->fi_generation);
+	}
+
+	finfo = au_fi(file);
+	if (!err) {
+		finfo->fi_file = file;
+		au_sphl_add(&finfo->fi_hlist,
+			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+	}
+	if (!no_lock)
+		fi_write_unlock(file);
+	else {
+		lockdep_off();
+		fi_write_unlock(file);
+		lockdep_on();
+	}
+	if (unlikely(err)) {
+		finfo->fi_hdir = NULL;
+		au_finfo_fin(file);
+	}
+
+out:
+	return err;
+}
+
+int au_reopen_nondir(struct file *file)
+{
+	int err;
+	aufs_bindex_t bstart;
+	struct dentry *dentry;
+	struct file *h_file, *h_file_tmp;
+
+	dentry = file->f_path.dentry;
+	bstart = au_dbstart(dentry);
+	h_file_tmp = NULL;
+	if (au_fbstart(file) == bstart) {
+		h_file = au_hf_top(file);
+		if (file->f_mode == h_file->f_mode)
+			return 0; /* success */
+		h_file_tmp = h_file;
+		get_file(h_file_tmp);
+		au_set_h_fptr(file, bstart, NULL);
+	}
+	AuDebugOn(au_fi(file)->fi_hdir);
+	/*
+	 * this can happen:
+	 * a file exists on both of the rw and ro branches
+	 * open --> dbstart and fbstart are both 0
+	 * prepend a branch as rw, the old "rw" becomes ro
+	 * remove rw/file
+	 * delete the top branch, "rw" becomes rw again
+	 *	--> dbstart is 1, fbstart is still 0
+	 * write --> fbstart is 0 but dbstart is 1
+	 */
+	/* AuDebugOn(au_fbstart(file) < bstart); */
+
+	h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC,
+			   file, /*force_wr*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file)) {
+		if (h_file_tmp) {
+			atomic_inc(&au_sbr(dentry->d_sb, bstart)->br_count);
+			au_set_h_fptr(file, bstart, h_file_tmp);
+			h_file_tmp = NULL;
+		}
+		goto out; /* todo: close all? */
+	}
+
+	err = 0;
+	au_set_fbstart(file, bstart);
+	au_set_h_fptr(file, bstart, h_file);
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+
+out:
+	if (h_file_tmp)
+		fput(h_file_tmp);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_reopen_wh(struct file *file, aufs_bindex_t btgt,
+			struct dentry *hi_wh)
+{
+	int err;
+	aufs_bindex_t bstart;
+	struct au_dinfo *dinfo;
+	struct dentry *h_dentry;
+	struct au_hdentry *hdp;
+
+	dinfo = au_di(file->f_path.dentry);
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	bstart = dinfo->di_bstart;
+	dinfo->di_bstart = btgt;
+	hdp = dinfo->di_hdentry;
+	h_dentry = hdp[0 + btgt].hd_dentry;
+	hdp[0 + btgt].hd_dentry = hi_wh;
+	err = au_reopen_nondir(file);
+	hdp[0 + btgt].hd_dentry = h_dentry;
+	dinfo->di_bstart = bstart;
+
+	return err;
+}
+
+static int au_ready_to_write_wh(struct file *file, loff_t len,
+				aufs_bindex_t bcpup, struct au_pin *pin)
+{
+	int err;
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry, *hi_wh;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= bcpup,
+		.bsrc	= -1,
+		.len	= len,
+		.pin	= pin
+	};
+
+	au_update_dbstart(cpg.dentry);
+	inode = d_inode(cpg.dentry);
+	h_inode = NULL;
+	if (au_dbstart(cpg.dentry) <= bcpup
+	    && au_dbend(cpg.dentry) >= bcpup) {
+		h_dentry = au_h_dptr(cpg.dentry, bcpup);
+		if (h_dentry && d_is_positive(h_dentry))
+			h_inode = d_inode(h_dentry);
+	}
+	hi_wh = au_hi_wh(inode, bcpup);
+	if (!hi_wh && !h_inode)
+		err = au_sio_cpup_wh(&cpg, file);
+	else
+		/* already copied-up after unlink */
+		err = au_reopen_wh(file, bcpup, hi_wh);
+
+	if (!err
+	    && (inode->i_nlink > 1
+		|| (inode->i_state & I_LINKABLE))
+	    && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK))
+		au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup));
+
+	return err;
+}
+
+/*
+ * prepare the @file for writing.
+ */
+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin)
+{
+	int err;
+	aufs_bindex_t dbstart;
+	struct dentry *parent;
+	struct inode *inode;
+	struct super_block *sb;
+	struct file *h_file;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= len,
+		.pin	= pin,
+		.flags	= AuCpup_DTIME
+	};
+
+	sb = cpg.dentry->d_sb;
+	inode = d_inode(cpg.dentry);
+	cpg.bsrc = au_fbstart(file);
+	err = au_test_ro(sb, cpg.bsrc, inode);
+	if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) {
+		err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE,
+			     /*flags*/0);
+		goto out;
+	}
+
+	/* need to cpup or reopen */
+	parent = dget_parent(cpg.dentry);
+	di_write_lock_parent(parent);
+	err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
+	cpg.bdst = err;
+	if (unlikely(err < 0))
+		goto out_dgrade;
+	err = 0;
+
+	if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) {
+		err = au_cpup_dirs(cpg.dentry, cpg.bdst);
+		if (unlikely(err))
+			goto out_dgrade;
+	}
+
+	err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_dgrade;
+
+	dbstart = au_dbstart(cpg.dentry);
+	if (dbstart <= cpg.bdst)
+		cpg.bsrc = cpg.bdst;
+
+	if (dbstart <= cpg.bdst		/* just reopen */
+	    || !d_unhashed(cpg.dentry)	/* copyup and reopen */
+		) {
+		h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0);
+		if (IS_ERR(h_file))
+			err = PTR_ERR(h_file);
+		else {
+			di_downgrade_lock(parent, AuLock_IR);
+			if (dbstart > cpg.bdst)
+				err = au_sio_cpup_simple(&cpg);
+			if (!err)
+				err = au_reopen_nondir(file);
+			au_h_open_post(cpg.dentry, cpg.bsrc, h_file);
+		}
+	} else {			/* copyup as wh and reopen */
+		/*
+		 * since a writable hfsplus branch is not supported,
+		 * h_open_pre/post() are unnecessary.
+		 */
+		err = au_ready_to_write_wh(file, len, cpg.bdst, pin);
+		di_downgrade_lock(parent, AuLock_IR);
+	}
+
+	if (!err) {
+		au_pin_set_parent_lflag(pin, /*lflag*/0);
+		goto out_dput; /* success */
+	}
+	au_unpin(pin);
+	goto out_unlock;
+
+out_dgrade:
+	di_downgrade_lock(parent, AuLock_IR);
+out_unlock:
+	di_read_unlock(parent, AuLock_IR);
+out_dput:
+	dput(parent);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_do_flush(struct file *file, fl_owner_t id,
+		int (*flush)(struct file *file, fl_owner_t id))
+{
+	int err;
+	struct super_block *sb;
+	struct inode *inode;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_noflush_read_lock(sb);
+	fi_read_lock(file);
+	ii_read_lock_child(inode);
+
+	err = flush(file, id);
+	au_cpup_attr_timesizes(inode);
+
+	ii_read_unlock(inode);
+	fi_read_unlock(file);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_file_refresh_by_inode(struct file *file, int *need_reopen)
+{
+	int err;
+	struct au_pin pin;
+	struct au_finfo *finfo;
+	struct dentry *parent, *hi_wh;
+	struct inode *inode;
+	struct super_block *sb;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= -1,
+		.pin	= &pin,
+		.flags	= AuCpup_DTIME
+	};
+
+	FiMustWriteLock(file);
+
+	err = 0;
+	finfo = au_fi(file);
+	sb = cpg.dentry->d_sb;
+	inode = d_inode(cpg.dentry);
+	cpg.bdst = au_ibstart(inode);
+	if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry))
+		goto out;
+
+	parent = dget_parent(cpg.dentry);
+	if (au_test_ro(sb, cpg.bdst, inode)) {
+		di_read_lock_parent(parent, !AuLock_IR);
+		err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
+		cpg.bdst = err;
+		di_read_unlock(parent, !AuLock_IR);
+		if (unlikely(err < 0))
+			goto out_parent;
+		err = 0;
+	}
+
+	di_read_lock_parent(parent, AuLock_IR);
+	hi_wh = au_hi_wh(inode, cpg.bdst);
+	if (!S_ISDIR(inode->i_mode)
+	    && au_opt_test(au_mntflags(sb), PLINK)
+	    && au_plink_test(inode)
+	    && !d_unhashed(cpg.dentry)
+	    && cpg.bdst < au_dbstart(cpg.dentry)) {
+		err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst);
+		if (unlikely(err))
+			goto out_unlock;
+
+		/* always superio. */
+		err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
+			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+		if (!err) {
+			err = au_sio_cpup_simple(&cpg);
+			au_unpin(&pin);
+		}
+	} else if (hi_wh) {
+		/* already copied-up after unlink */
+		err = au_reopen_wh(file, cpg.bdst, hi_wh);
+		*need_reopen = 0;
+	}
+
+out_unlock:
+	di_read_unlock(parent, AuLock_IR);
+out_parent:
+	dput(parent);
+out:
+	return err;
+}
+
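+/*
+ * After a branch add/del/mod, the index of each branch may have changed.
+ * Re-order fidir->fd_hfile[] so that every opened lower file sits at the
+ * index currently assigned to its branch, drop the files whose branch is
+ * gone, and finally shrink fi_btop/fd_bbot to the remaining valid range.
+ */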
+static void au_do_refresh_dir(struct file *file)
+{
+	aufs_bindex_t bindex, bend, new_bindex, brid;
+	struct au_hfile *p, tmp, *q;
+	struct au_finfo *finfo;
+	struct super_block *sb;
+	struct au_fidir *fidir;
+
+	FiMustWriteLock(file);
+
+	sb = file->f_path.dentry->d_sb;
+	finfo = au_fi(file);
+	fidir = finfo->fi_hdir;
+	AuDebugOn(!fidir);
+	p = fidir->fd_hfile + finfo->fi_btop;
+	brid = p->hf_br->br_id;
+	bend = fidir->fd_bbot;
+	for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) {
+		if (!p->hf_file)
+			continue;
+
+		new_bindex = au_br_index(sb, p->hf_br->br_id);
+		if (new_bindex == bindex)
+			continue;
+		if (new_bindex < 0) {
+			au_set_h_fptr(file, bindex, NULL);
+			continue;
+		}
+
+		/* swap the two lower file entries, and retry this index */
+		q = fidir->fd_hfile + new_bindex;
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hf_file) {
+			bindex--;
+			p--;
+		}
+	}
+
+	p = fidir->fd_hfile;
+	if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
+		bend = au_sbend(sb);
+		for (finfo->fi_btop = 0; finfo->fi_btop <= bend;
+		     finfo->fi_btop++, p++)
+			if (p->hf_file) {
+				if (file_inode(p->hf_file))
+					break;
+				au_hfput(p, file);
+			}
+	} else {
+		bend = au_br_index(sb, brid);
+		for (finfo->fi_btop = 0; finfo->fi_btop < bend;
+		     finfo->fi_btop++, p++)
+			if (p->hf_file)
+				au_hfput(p, file);
+		bend = au_sbend(sb);
+	}
+
+	p = fidir->fd_hfile + bend;
+	for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop;
+	     fidir->fd_bbot--, p--)
+		if (p->hf_file) {
+			if (file_inode(p->hf_file))
+				break;
+			au_hfput(p, file);
+		}
+	AuDebugOn(fidir->fd_bbot < finfo->fi_btop);
+}
+
+/*
+ * after branch manipulating, refresh the file.
+ */
+static int refresh_file(struct file *file, int (*reopen)(struct file *file))
+{
+	int err, need_reopen;
+	aufs_bindex_t bend, bindex;
+	struct dentry *dentry;
+	struct au_finfo *finfo;
+	struct au_hfile *hfile;
+
+	dentry = file->f_path.dentry;
+	finfo = au_fi(file);
+	if (!finfo->fi_hdir) {
+		hfile = &finfo->fi_htop;
+		AuDebugOn(!hfile->hf_file);
+		bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id);
+		AuDebugOn(bindex < 0);
+		if (bindex != finfo->fi_btop)
+			au_set_fbstart(file, bindex);
+	} else {
+		err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1);
+		if (unlikely(err))
+			goto out;
+		au_do_refresh_dir(file);
+	}
+
+	err = 0;
+	need_reopen = 1;
+	if (!au_test_mmapped(file))
+		err = au_file_refresh_by_inode(file, &need_reopen);
+	if (!err && need_reopen && !d_unlinked(dentry))
+		err = reopen(file);
+	if (!err) {
+		au_update_figen(file);
+		goto out; /* success */
+	}
+
+	/* error, close all lower files */
+	if (finfo->fi_hdir) {
+		bend = au_fbend_dir(file);
+		for (bindex = au_fbstart(file); bindex <= bend; bindex++)
+			au_set_h_fptr(file, bindex, NULL);
+	}
+
+out:
+	return err;
+}
+
+/* common function to regular file and dir */
+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
+			  int wlock)
+{
+	int err;
+	unsigned int sigen, figen;
+	aufs_bindex_t bstart;
+	unsigned char pseudo_link;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	sigen = au_sigen(dentry->d_sb);
+	fi_write_lock(file);
+	figen = au_figen(file);
+	di_write_lock_child(dentry);
+	bstart = au_dbstart(dentry);
+	pseudo_link = (bstart != au_ibstart(inode));
+	if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) {
+		if (!wlock) {
+			di_downgrade_lock(dentry, AuLock_IR);
+			fi_downgrade_lock(file);
+		}
+		goto out; /* success */
+	}
+
+	AuDbg("sigen %d, figen %d\n", sigen, figen);
+	if (au_digen_test(dentry, sigen)) {
+		err = au_reval_dpath(dentry, sigen);
+		AuDebugOn(!err && au_digen_test(dentry, sigen));
+	}
+
+	if (!err)
+		err = refresh_file(file, reopen);
+	if (!err) {
+		if (!wlock) {
+			di_downgrade_lock(dentry, AuLock_IR);
+			fi_downgrade_lock(file);
+		}
+	} else {
+		di_write_unlock(dentry);
+		fi_write_unlock(file);
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* cf. aufs_nopage() */
+/* for madvise(2) */
+static int aufs_readpage(struct file *file __maybe_unused, struct page *page)
+{
+	unlock_page(page);
+	return 0;
+}
+
+/* it will never be called, but is necessary to support O_DIRECT */
+static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			      loff_t offset)
+{ BUG(); return 0; }
+
+/* they will never be called. */
+#ifdef CONFIG_AUFS_DEBUG
+static int aufs_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{ AuUnsupport(); return 0; }
+static int aufs_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{ AuUnsupport(); return 0; }
+static int aufs_writepage(struct page *page, struct writeback_control *wbc)
+{ AuUnsupport(); return 0; }
+
+static int aufs_set_page_dirty(struct page *page)
+{ AuUnsupport(); return 0; }
+static void aufs_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{ AuUnsupport(); }
+static int aufs_releasepage(struct page *page, gfp_t gfp)
+{ AuUnsupport(); return 0; }
+#if 0 /* called by memory compaction regardless of the file */
+static int aufs_migratepage(struct address_space *mapping, struct page *newpage,
+			    struct page *page, enum migrate_mode mode)
+{ AuUnsupport(); return 0; }
+#endif
+static int aufs_launder_page(struct page *page)
+{ AuUnsupport(); return 0; }
+static int aufs_is_partially_uptodate(struct page *page,
+				      unsigned long from,
+				      unsigned long count)
+{ AuUnsupport(); return 0; }
+static void aufs_is_dirty_writeback(struct page *page, bool *dirty,
+				    bool *writeback)
+{ AuUnsupport(); }
+static int aufs_error_remove_page(struct address_space *mapping,
+				  struct page *page)
+{ AuUnsupport(); return 0; }
+static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file,
+			      sector_t *span)
+{ AuUnsupport(); return 0; }
+static void aufs_swap_deactivate(struct file *file)
+{ AuUnsupport(); }
+#endif /* CONFIG_AUFS_DEBUG */
+
+const struct address_space_operations aufs_aop = {
+	.readpage		= aufs_readpage,
+	.direct_IO		= aufs_direct_IO,
+#ifdef CONFIG_AUFS_DEBUG
+	.writepage		= aufs_writepage,
+	/* no writepages, because of writepage */
+	.set_page_dirty		= aufs_set_page_dirty,
+	/* no readpages, because of readpage */
+	.write_begin		= aufs_write_begin,
+	.write_end		= aufs_write_end,
+	/* no bmap, no block device */
+	.invalidatepage		= aufs_invalidatepage,
+	.releasepage		= aufs_releasepage,
+	/* is fallback_migrate_page ok? */
+	/* .migratepage		= aufs_migratepage, */
+	.launder_page		= aufs_launder_page,
+	.is_partially_uptodate	= aufs_is_partially_uptodate,
+	.is_dirty_writeback	= aufs_is_dirty_writeback,
+	.error_remove_page	= aufs_error_remove_page,
+	.swap_activate		= aufs_swap_activate,
+	.swap_deactivate	= aufs_swap_deactivate
+#endif /* CONFIG_AUFS_DEBUG */
+};
diff --git b/fs/aufs/file.h b/fs/aufs/file.h
new file mode 100644
index 0000000..27d8024
--- /dev/null
+++ b/fs/aufs/file.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file operations
+ */
+
+#ifndef __AUFS_FILE_H__
+#define __AUFS_FILE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include "rwsem.h"
+
+struct au_branch;
+struct au_hfile {
+	struct file		*hf_file;
+	struct au_branch	*hf_br;
+};
+
+struct au_vdir;
+struct au_fidir {
+	aufs_bindex_t		fd_bbot;
+	aufs_bindex_t		fd_nent;
+	struct au_vdir		*fd_vdir_cache;
+	struct au_hfile		fd_hfile[];
+};
+
+static inline int au_fidir_sz(int nent)
+{
+	AuDebugOn(nent < 0);
+	return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent;
+}
+
+struct au_finfo {
+	atomic_t		fi_generation;
+
+	struct au_rwsem		fi_rwsem;
+	aufs_bindex_t		fi_btop;
+
+	/* do not union them */
+	struct {				/* for non-dir */
+		struct au_hfile			fi_htop;
+		atomic_t			fi_mmapped;
+	};
+	struct au_fidir		*fi_hdir;	/* for dir only */
+
+	struct hlist_node	fi_hlist;
+	struct file		*fi_file;	/* very ugly */
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* file.c */
+extern const struct address_space_operations aufs_aop;
+unsigned int au_file_roflags(unsigned int flags);
+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
+		       struct file *file, int force_wr);
+struct au_do_open_args {
+	int		no_lock;
+	int		(*open)(struct file *file, int flags,
+				struct file *h_file);
+	struct au_fidir	*fidir;
+	struct file	*h_file;
+};
+int au_do_open(struct file *file, struct au_do_open_args *args);
+int au_reopen_nondir(struct file *file);
+struct au_pin;
+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin);
+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
+			  int wlock);
+int au_do_flush(struct file *file, fl_owner_t id,
+		int (*flush)(struct file *file, fl_owner_t id));
+
+/* poll.c */
+#ifdef CONFIG_AUFS_POLL
+unsigned int aufs_poll(struct file *file, poll_table *wait);
+#endif
+
+#ifdef CONFIG_AUFS_BR_HFSPLUS
+/* hfsplus.c */
+struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
+			   int force_wr);
+void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
+		    struct file *h_file);
+#else
+AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry,
+       aufs_bindex_t bindex, int force_wr)
+AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex,
+	   struct file *h_file);
+#endif
+
+/* f_op.c */
+extern const struct file_operations aufs_file_fop;
+int au_do_open_nondir(struct file *file, int flags, struct file *h_file);
+int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file);
+struct file *au_read_pre(struct file *file, int keep_fi);
+
+/* finfo.c */
+void au_hfput(struct au_hfile *hf, struct file *file);
+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex,
+		   struct file *h_file);
+
+void au_update_figen(struct file *file);
+struct au_fidir *au_fidir_alloc(struct super_block *sb);
+int au_fidir_realloc(struct au_finfo *finfo, int nbr);
+
+void au_fi_init_once(void *_fi);
+void au_finfo_fin(struct file *file);
+int au_finfo_init(struct file *file, struct au_fidir *fidir);
+
+/* ioctl.c */
+long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
+			   unsigned long arg);
+long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
+			      unsigned long arg);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_finfo *au_fi(struct file *file)
+{
+	return file->private_data;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * fi_read_lock, fi_write_lock,
+ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock
+ */
+AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem);
+
+#define FiMustNoWaiters(f)	AuRwMustNoWaiters(&au_fi(f)->fi_rwsem)
+#define FiMustAnyLock(f)	AuRwMustAnyLock(&au_fi(f)->fi_rwsem)
+#define FiMustWriteLock(f)	AuRwMustWriteLock(&au_fi(f)->fi_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: hard/soft set? */
+static inline aufs_bindex_t au_fbstart(struct file *file)
+{
+	FiMustAnyLock(file);
+	return au_fi(file)->fi_btop;
+}
+
+static inline aufs_bindex_t au_fbend_dir(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_bbot;
+}
+
+static inline struct au_vdir *au_fvdir_cache(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_vdir_cache;
+}
+
+static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustWriteLock(file);
+	au_fi(file)->fi_btop = bindex;
+}
+
+static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustWriteLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	au_fi(file)->fi_hdir->fd_bbot = bindex;
+}
+
+static inline void au_set_fvdir_cache(struct file *file,
+				      struct au_vdir *vdir_cache)
+{
+	FiMustWriteLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache;
+}
+
+static inline struct file *au_hf_top(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_htop.hf_file;
+}
+
+static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file;
+}
+
+/* todo: memory barrier? */
+static inline unsigned int au_figen(struct file *f)
+{
+	return atomic_read(&au_fi(f)->fi_generation);
+}
+
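+/*
+ * fi_mmapped counts the current mappings of this file. If the increment
+ * ever wraps around to zero, warn and keep incrementing until the counter
+ * is non-zero again, so that au_test_mmapped() does not report "unmapped"
+ * while a mapping still exists.
+ */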
+static inline void au_set_mmapped(struct file *f)
+{
+	if (atomic_inc_return(&au_fi(f)->fi_mmapped))
+		return;
+	pr_warn("fi_mmapped wrapped around\n");
+	while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
+		;
+}
+
+static inline void au_unset_mmapped(struct file *f)
+{
+	atomic_dec(&au_fi(f)->fi_mmapped);
+}
+
+static inline int au_test_mmapped(struct file *f)
+{
+	return atomic_read(&au_fi(f)->fi_mmapped);
+}
+
+/* customize vma->vm_file */
+
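+/*
+ * Replace the file referenced by @vma: take a reference on @file, install it
+ * as vma->vm_file, and drop the reference previously held by the vma.
+ */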
+static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
+				       struct file *file)
+{
+	struct file *f;
+
+	f = vma->vm_file;
+	get_file(file);
+	vma->vm_file = file;
+	fput(f);
+}
+
+#ifdef CONFIG_MMU
+#define AuDbgVmRegion(file, vma) do {} while (0)
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	au_do_vm_file_reset(vma, file);
+}
+#else
+#define AuDbgVmRegion(file, vma) \
+	AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	struct file *f;
+
+	au_do_vm_file_reset(vma, file);
+	f = vma->vm_region->vm_file;
+	get_file(file);
+	vma->vm_region->vm_file = file;
+	fput(f);
+}
+#endif /* CONFIG_MMU */
+
+/* handle vma->vm_prfile */
+static inline void au_vm_prfile_set(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	get_file(file);
+	vma->vm_prfile = file;
+#ifndef CONFIG_MMU
+	get_file(file);
+	vma->vm_region->vm_prfile = file;
+#endif
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_FILE_H__ */
diff --git b/fs/aufs/finfo.c b/fs/aufs/finfo.c
new file mode 100644
index 0000000..b5eb55d
--- /dev/null
+++ b/fs/aufs/finfo.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file private data
+ */
+
+#include "aufs.h"
+
+void au_hfput(struct au_hfile *hf, struct file *file)
+{
+	/* todo: direct access f_flags */
+	if (vfsub_file_flags(file) & __FMODE_EXEC)
+		allow_write_access(hf->hf_file);
+	fput(hf->hf_file);
+	hf->hf_file = NULL;
+	atomic_dec(&hf->hf_br->br_count);
+	hf->hf_br = NULL;
+}
+
+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val)
+{
+	struct au_finfo *finfo = au_fi(file);
+	struct au_hfile *hf;
+	struct au_fidir *fidir;
+
+	fidir = finfo->fi_hdir;
+	if (!fidir) {
+		AuDebugOn(finfo->fi_btop != bindex);
+		hf = &finfo->fi_htop;
+	} else
+		hf = fidir->fd_hfile + bindex;
+
+	if (hf && hf->hf_file)
+		au_hfput(hf, file);
+	if (val) {
+		FiMustWriteLock(file);
+		AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry));
+		hf->hf_file = val;
+		hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex);
+	}
+}
+
+void au_update_figen(struct file *file)
+{
+	atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry));
+	/* smp_mb(); */ /* atomic_set */
+}
+
+/* ---------------------------------------------------------------------- */
+
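+/*
+ * Allocate the per-file array of lower files for a directory, sized for
+ * the current number of branches (at least two). It is grown later by
+ * au_fidir_realloc() when more branches are added.
+ */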
+struct au_fidir *au_fidir_alloc(struct super_block *sb)
+{
+	struct au_fidir *fidir;
+	int nbr;
+
+	nbr = au_sbend(sb) + 1;
+	if (nbr < 2)
+		nbr = 2; /* initial allocate for 2 branches */
+	fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS);
+	if (fidir) {
+		fidir->fd_bbot = -1;
+		fidir->fd_nent = nbr;
+	}
+
+	return fidir;
+}
+
+int au_fidir_realloc(struct au_finfo *finfo, int nbr)
+{
+	int err;
+	struct au_fidir *fidir, *p;
+
+	AuRwMustWriteLock(&finfo->fi_rwsem);
+	fidir = finfo->fi_hdir;
+	AuDebugOn(!fidir);
+
+	err = -ENOMEM;
+	p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr),
+			 GFP_NOFS);
+	if (p) {
+		p->fd_nent = nbr;
+		finfo->fi_hdir = p;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_finfo_fin(struct file *file)
+{
+	struct au_finfo *finfo;
+
+	au_nfiles_dec(file->f_path.dentry->d_sb);
+
+	finfo = au_fi(file);
+	AuDebugOn(finfo->fi_hdir);
+	AuRwDestroy(&finfo->fi_rwsem);
+	au_cache_free_finfo(finfo);
+}
+
+void au_fi_init_once(void *_finfo)
+{
+	struct au_finfo *finfo = _finfo;
+	static struct lock_class_key aufs_fi;
+
+	au_rw_init(&finfo->fi_rwsem);
+	au_rw_class(&finfo->fi_rwsem, &aufs_fi);
+}
+
+int au_finfo_init(struct file *file, struct au_fidir *fidir)
+{
+	int err;
+	struct au_finfo *finfo;
+	struct dentry *dentry;
+
+	err = -ENOMEM;
+	dentry = file->f_path.dentry;
+	finfo = au_cache_alloc_finfo();
+	if (unlikely(!finfo))
+		goto out;
+
+	err = 0;
+	au_nfiles_inc(dentry->d_sb);
+	/* verbose coding for lock class name */
+	if (!fidir)
+		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcNonDir_FIINFO);
+	else
+		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcDir_FIINFO);
+	au_rw_write_lock(&finfo->fi_rwsem);
+	finfo->fi_btop = -1;
+	finfo->fi_hdir = fidir;
+	atomic_set(&finfo->fi_generation, au_digen(dentry));
+	/* smp_mb(); */ /* atomic_set */
+
+	file->private_data = finfo;
+
+out:
+	return err;
+}
diff --git b/fs/aufs/fstype.h b/fs/aufs/fstype.h
new file mode 100644
index 0000000..725b2ff
--- /dev/null
+++ b/fs/aufs/fstype.h
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * judging filesystem type
+ */
+
+#ifndef __AUFS_FSTYPE_H__
+#define __AUFS_FSTYPE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/nfs_fs.h>
+#include <linux/romfs_fs.h>
+
+static inline int au_test_aufs(struct super_block *sb)
+{
+	return sb->s_magic == AUFS_SUPER_MAGIC;
+}
+
+static inline const char *au_sbtype(struct super_block *sb)
+{
+	return sb->s_type->name;
+}
+
+static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE)
+	return sb->s_magic == ISOFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_romfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE)
+	return sb->s_magic == ROMFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE)
+	return sb->s_magic == CRAMFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_nfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
+	return sb->s_magic == NFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_fuse(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
+	return sb->s_magic == FUSE_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_xfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE)
+	return sb->s_magic == XFS_SB_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_TMPFS
+	return sb->s_magic == TMPFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE)
+	return !strcmp(au_sbtype(sb), "ecryptfs");
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_ramfs(struct super_block *sb)
+{
+	return sb->s_magic == RAMFS_MAGIC;
+}
+
+static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE)
+	return sb->s_magic == UBIFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_procfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_PROC_FS
+	return sb->s_magic == PROC_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_SYSFS
+	return sb->s_magic == SYSFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_configfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE)
+	return sb->s_magic == CONFIGFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_minix(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE)
+	return sb->s_magic == MINIX3_SUPER_MAGIC
+		|| sb->s_magic == MINIX2_SUPER_MAGIC
+		|| sb->s_magic == MINIX2_SUPER_MAGIC2
+		|| sb->s_magic == MINIX_SUPER_MAGIC
+		|| sb->s_magic == MINIX_SUPER_MAGIC2;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_fat(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE)
+	return sb->s_magic == MSDOS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_msdos(struct super_block *sb)
+{
+	return au_test_fat(sb);
+}
+
+static inline int au_test_vfat(struct super_block *sb)
+{
+	return au_test_fat(sb);
+}
+
+static inline int au_test_securityfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_SECURITYFS
+	return sb->s_magic == SECURITYFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_squashfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE)
+	return sb->s_magic == SQUASHFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_btrfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE)
+	return sb->s_magic == BTRFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_xenfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE)
+	return sb->s_magic == XENFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_debugfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_DEBUG_FS
+	return sb->s_magic == DEBUGFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_nilfs(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE)
+	return sb->s_magic == NILFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_hfsplus(struct super_block *sb __maybe_unused)
+{
+#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE)
+	return sb->s_magic == HFSPLUS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * they can't be an aufs branch.
+ */
+static inline int au_test_fs_unsuppoted(struct super_block *sb)
+{
+	return
+#ifndef CONFIG_AUFS_BR_RAMFS
+		au_test_ramfs(sb) ||
+#endif
+		au_test_procfs(sb)
+		|| au_test_sysfs(sb)
+		|| au_test_configfs(sb)
+		|| au_test_debugfs(sb)
+		|| au_test_securityfs(sb)
+		|| au_test_xenfs(sb)
+		|| au_test_ecryptfs(sb)
+		/* || !strcmp(au_sbtype(sb), "unionfs") */
+		|| au_test_aufs(sb); /* will be supported in next version */
+}
+
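+/*
+ * A branch fs is treated as "remote" when it does not require a backing
+ * device, excluding the in-memory filesystems (tmpfs, and ramfs when it is
+ * allowed as a branch) which are local in spite of having no device.
+ */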
+static inline int au_test_fs_remote(struct super_block *sb)
+{
+	return !au_test_tmpfs(sb)
+#ifdef CONFIG_AUFS_BR_RAMFS
+		&& !au_test_ramfs(sb)
+#endif
+		&& !(sb->s_type->fs_flags & FS_REQUIRES_DEV);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * Note: the functions below were written after reading ->getattr() in all
+ * filesystems under linux/fs, which means the survey has to be repeated on
+ * every kernel update...
+ */
+
+/*
+ * Some filesystems require getattr to refresh the inode attributes before
+ * they are referenced.
+ * In most cases we can rely on the inode attributes of NFS (or any remote fs)
+ * and leave the work to d_revalidate().
+ */
+static inline int au_test_fs_refresh_iattr(struct super_block *sb)
+{
+	return au_test_nfs(sb)
+		|| au_test_fuse(sb)
+		/* || au_test_btrfs(sb) */	/* untested */
+		;
+}
+
+/*
+ * filesystems which don't maintain i_size or i_blocks.
+ */
+static inline int au_test_fs_bad_iattr_size(struct super_block *sb)
+{
+	return au_test_xfs(sb)
+		|| au_test_btrfs(sb)
+		|| au_test_ubifs(sb)
+		|| au_test_hfsplus(sb)	/* maintained, but incorrect */
+		/* || au_test_minix(sb) */	/* untested */
+		;
+}
+
+/*
+ * filesystems which don't store the correct value in some of their inode
+ * attributes.
+ */
+static inline int au_test_fs_bad_iattr(struct super_block *sb)
+{
+	return au_test_fs_bad_iattr_size(sb)
+		|| au_test_fat(sb)
+		|| au_test_msdos(sb)
+		|| au_test_vfat(sb);
+}
+
+/* they don't check i_nlink in link(2) */
+static inline int au_test_fs_no_limit_nlink(struct super_block *sb)
+{
+	return au_test_tmpfs(sb)
+#ifdef CONFIG_AUFS_BR_RAMFS
+		|| au_test_ramfs(sb)
+#endif
+		|| au_test_ubifs(sb)
+		|| au_test_hfsplus(sb);
+}
+
+/*
+ * filesystems which set S_NOATIME and S_NOCMTIME.
+ */
+static inline int au_test_fs_notime(struct super_block *sb)
+{
+	return au_test_nfs(sb)
+		|| au_test_fuse(sb)
+		|| au_test_ubifs(sb)
+		;
+}
+
+/* temporary support for inode number 1 in cramfs */
+static inline int au_test_fs_unique_ino(struct inode *inode)
+{
+	if (au_test_cramfs(inode->i_sb))
+		return inode->i_ino != 1;
+	return 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * the filesystem where the xino files are placed must support I/O after unlink
+ * and maintain i_size and i_blocks.
+ */
+static inline int au_test_fs_bad_xino(struct super_block *sb)
+{
+	return au_test_fs_remote(sb)
+		|| au_test_fs_bad_iattr_size(sb)
+		/* don't want unnecessary work for xino */
+		|| au_test_aufs(sb)
+		|| au_test_ecryptfs(sb)
+		|| au_test_nilfs(sb);
+}
+
+static inline int au_test_fs_trunc_xino(struct super_block *sb)
+{
+	return au_test_tmpfs(sb)
+		|| au_test_ramfs(sb);
+}
+
+/*
+ * test if the @sb is real-readonly.
+ */
+static inline int au_test_fs_rr(struct super_block *sb)
+{
+	return au_test_squashfs(sb)
+		|| au_test_iso9660(sb)
+		|| au_test_cramfs(sb)
+		|| au_test_romfs(sb);
+}
+
+/*
+ * test if the @inode is NFS with the 'noacl' option.
+ * NFS always sets MS_POSIXACL regardless of its mount option 'noacl'.
+ */
+static inline int au_test_nfs_noacl(struct inode *inode)
+{
+	return au_test_nfs(inode->i_sb)
+		/* && IS_POSIXACL(inode) */
+		&& !nfs_server_capable(inode, NFS_CAP_ACLS);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_FSTYPE_H__ */
diff --git b/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c
new file mode 100644
index 0000000..c0a1a63
--- /dev/null
+++ b/fs/aufs/hfsnotify.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * fsnotify for the lower directories
+ */
+
+#include "aufs.h"
+
+/* FS_IN_IGNORED is unnecessary */
+static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE
+				 | FS_CREATE | FS_EVENT_ON_CHILD);
+static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq);
+static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0);
+
+static void au_hfsn_free_mark(struct fsnotify_mark *mark)
+{
+	struct au_hnotify *hn = container_of(mark, struct au_hnotify,
+					     hn_mark);
+	AuDbg("here\n");
+	au_cache_free_hnotify(hn);
+	smp_mb__before_atomic();
+	if (atomic64_dec_and_test(&au_hfsn_ifree))
+		wake_up(&au_hfsn_wq);
+}
+
+static int au_hfsn_alloc(struct au_hinode *hinode)
+{
+	int err;
+	struct au_hnotify *hn;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct fsnotify_mark *mark;
+	aufs_bindex_t bindex;
+
+	hn = hinode->hi_notify;
+	sb = hn->hn_aufs_inode->i_sb;
+	bindex = au_br_index(sb, hinode->hi_id);
+	br = au_sbr(sb, bindex);
+	AuDebugOn(!br->br_hfsn);
+
+	mark = &hn->hn_mark;
+	fsnotify_init_mark(mark, au_hfsn_free_mark);
+	mark->mask = AuHfsnMask;
+	/*
+	 * on a rename or rmdir detected by udba, aufs assigns a new inode to
+	 * the known h_inode, so specify 1 to allow duplicate marks.
+	 */
+	lockdep_off();
+	err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode,
+				 /*mnt*/NULL, /*allow_dups*/1);
+	/* even if err */
+	fsnotify_put_mark(mark);
+	lockdep_on();
+
+	return err;
+}
+
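+/*
+ * au_hfsn_ifree counts the marks whose final release is still pending; it is
+ * incremented below, decremented in au_hfsn_free_mark(), and au_hfsn_fin()
+ * waits for it to drop back to zero before tearing down.
+ */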
+static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn)
+{
+	struct fsnotify_mark *mark;
+	unsigned long long ull;
+	struct fsnotify_group *group;
+
+	ull = atomic64_inc_return(&au_hfsn_ifree);
+	BUG_ON(!ull);
+
+	mark = &hn->hn_mark;
+	spin_lock(&mark->lock);
+	group = mark->group;
+	fsnotify_get_group(group);
+	spin_unlock(&mark->lock);
+	lockdep_off();
+	fsnotify_destroy_mark(mark, group);
+	fsnotify_put_group(group);
+	lockdep_on();
+
+	/* free hn by myself */
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_ctl(struct au_hinode *hinode, int do_set)
+{
+	struct fsnotify_mark *mark;
+
+	mark = &hinode->hi_notify->hn_mark;
+	spin_lock(&mark->lock);
+	if (do_set) {
+		AuDebugOn(mark->mask & AuHfsnMask);
+		mark->mask |= AuHfsnMask;
+	} else {
+		AuDebugOn(!(mark->mask & AuHfsnMask));
+		mark->mask &= ~AuHfsnMask;
+	}
+	spin_unlock(&mark->lock);
+	/* fsnotify_recalc_inode_mask(hinode->hi_inode); */
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* #define AuDbgHnotify */
+#ifdef AuDbgHnotify
+static char *au_hfsn_name(u32 mask)
+{
+#ifdef CONFIG_AUFS_DEBUG
+#define test_ret(flag)				\
+	do {					\
+		if (mask & flag)		\
+			return #flag;		\
+	} while (0)
+	test_ret(FS_ACCESS);
+	test_ret(FS_MODIFY);
+	test_ret(FS_ATTRIB);
+	test_ret(FS_CLOSE_WRITE);
+	test_ret(FS_CLOSE_NOWRITE);
+	test_ret(FS_OPEN);
+	test_ret(FS_MOVED_FROM);
+	test_ret(FS_MOVED_TO);
+	test_ret(FS_CREATE);
+	test_ret(FS_DELETE);
+	test_ret(FS_DELETE_SELF);
+	test_ret(FS_MOVE_SELF);
+	test_ret(FS_UNMOUNT);
+	test_ret(FS_Q_OVERFLOW);
+	test_ret(FS_IN_IGNORED);
+	test_ret(FS_ISDIR);
+	test_ret(FS_IN_ONESHOT);
+	test_ret(FS_EVENT_ON_CHILD);
+	return "";
+#undef test_ret
+#else
+	return "??";
+#endif
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_free_group(struct fsnotify_group *group)
+{
+	struct au_br_hfsnotify *hfsn = group->private;
+
+	AuDbg("here\n");
+	kfree(hfsn);
+}
+
+static int au_hfsn_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie)
+{
+	int err;
+	struct au_hnotify *hnotify;
+	struct inode *h_dir, *h_inode;
+	struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name));
+
+	AuDebugOn(data_type != FSNOTIFY_EVENT_INODE);
+
+	err = 0;
+	/* if FS_UNMOUNT happens, there must be another bug */
+	AuDebugOn(mask & FS_UNMOUNT);
+	if (mask & (FS_IN_IGNORED | FS_UNMOUNT))
+		goto out;
+
+	h_dir = inode;
+	h_inode = NULL;
+#ifdef AuDbgHnotify
+	au_debug_on();
+	if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1
+	    || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) {
+		AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n",
+		      h_dir->i_ino, mask, au_hfsn_name(mask),
+		      AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0);
+		/* WARN_ON(1); */
+	}
+	au_debug_off();
+#endif
+
+	AuDebugOn(!inode_mark);
+	hnotify = container_of(inode_mark, struct au_hnotify, hn_mark);
+	err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode);
+
+out:
+	return err;
+}
+
+static struct fsnotify_ops au_hfsn_ops = {
+	.handle_event		= au_hfsn_handle_event,
+	.free_group_priv	= au_hfsn_free_group
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_fin_br(struct au_branch *br)
+{
+	struct au_br_hfsnotify *hfsn;
+
+	hfsn = br->br_hfsn;
+	if (hfsn) {
+		lockdep_off();
+		fsnotify_put_group(hfsn->hfsn_group);
+		lockdep_on();
+	}
+}
+
+static int au_hfsn_init_br(struct au_branch *br, int perm)
+{
+	int err;
+	struct fsnotify_group *group;
+	struct au_br_hfsnotify *hfsn;
+
+	err = 0;
+	br->br_hfsn = NULL;
+	if (!au_br_hnotifyable(perm))
+		goto out;
+
+	err = -ENOMEM;
+	hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS);
+	if (unlikely(!hfsn))
+		goto out;
+
+	err = 0;
+	group = fsnotify_alloc_group(&au_hfsn_ops);
+	if (IS_ERR(group)) {
+		err = PTR_ERR(group);
+		pr_err("fsnotify_alloc_group() failed, %d\n", err);
+		goto out_hfsn;
+	}
+
+	group->private = hfsn;
+	hfsn->hfsn_group = group;
+	br->br_hfsn = hfsn;
+	goto out; /* success */
+
+out_hfsn:
+	kfree(hfsn);
+out:
+	return err;
+}
+
+static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm)
+{
+	int err;
+
+	err = 0;
+	if (!br->br_hfsn)
+		err = au_hfsn_init_br(br, perm);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_fin(void)
+{
+	AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree));
+	wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree));
+}
+
+const struct au_hnotify_op au_hnotify_op = {
+	.ctl		= au_hfsn_ctl,
+	.alloc		= au_hfsn_alloc,
+	.free		= au_hfsn_free,
+
+	.fin		= au_hfsn_fin,
+
+	.reset_br	= au_hfsn_reset_br,
+	.fin_br		= au_hfsn_fin_br,
+	.init_br	= au_hfsn_init_br
+};
diff --git b/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c
new file mode 100644
index 0000000..145c6ac
--- /dev/null
+++ b/fs/aufs/hfsplus.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * special support for filesystems which acquire an inode mutex
+ * at the final close of a file, e.g. hfsplus.
+ *
+ * This trick is very simple and stupid: just open the file before the really
+ * necessary open, to tell hfsplus that this is not the final close.
+ * The caller should call au_h_open_pre() after acquiring the inode mutex,
+ * and au_h_open_post() after releasing it.
+ */
+
+#include "aufs.h"
+
+struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
+			   int force_wr)
+{
+	struct file *h_file;
+	struct dentry *h_dentry;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	AuDebugOn(!h_dentry);
+	AuDebugOn(d_is_negative(h_dentry));
+
+	h_file = NULL;
+	if (au_test_hfsplus(h_dentry->d_sb)
+	    && d_is_reg(h_dentry))
+		h_file = au_h_open(dentry, bindex,
+				   O_RDONLY | O_NOATIME | O_LARGEFILE,
+				   /*file*/NULL, force_wr);
+	return h_file;
+}
+
+void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
+		    struct file *h_file)
+{
+	if (h_file) {
+		fput(h_file);
+		au_sbr_put(dentry->d_sb, bindex);
+	}
+}
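+
+#if 0
+/*
+ * Usage sketch only, not part of aufs; the caller and the lock context are
+ * made up to illustrate the comment at the top of this file: call
+ * au_h_open_pre() after acquiring the lower inode mutex and au_h_open_post()
+ * after releasing it. Error handling is omitted.
+ */
+static void example_hfsplus_caller(struct dentry *dentry, aufs_bindex_t bindex,
+				   struct inode *h_inode)
+{
+	struct file *h_file;
+
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	h_file = au_h_open_pre(dentry, bindex, /*force_wr*/0);
+	/* ... the real work, which may close the lower file ... */
+	inode_unlock(h_inode);
+	au_h_open_post(dentry, bindex, h_file);
+}
+#endif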
diff --git b/fs/aufs/hnotify.c b/fs/aufs/hnotify.c
new file mode 100644
index 0000000..92e432b
--- /dev/null
+++ b/fs/aufs/hnotify.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * abstraction to notify the direct changes on lower directories
+ */
+
+#include "aufs.h"
+
+int au_hn_alloc(struct au_hinode *hinode, struct inode *inode)
+{
+	int err;
+	struct au_hnotify *hn;
+
+	err = -ENOMEM;
+	hn = au_cache_alloc_hnotify();
+	if (hn) {
+		hn->hn_aufs_inode = inode;
+		hinode->hi_notify = hn;
+		err = au_hnotify_op.alloc(hinode);
+		AuTraceErr(err);
+		if (unlikely(err)) {
+			hinode->hi_notify = NULL;
+			au_cache_free_hnotify(hn);
+			/*
+			 * The upper dir was removed by udba, but a dir with the
+			 * same name remains. In this case, aufs assigns a new
+			 * inode number and sets the monitor again.
+			 * For the lower dir, the old monitor is still left.
+			 */
+			if (err == -EEXIST)
+				err = 0;
+		}
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+void au_hn_free(struct au_hinode *hinode)
+{
+	struct au_hnotify *hn;
+
+	hn = hinode->hi_notify;
+	if (hn) {
+		hinode->hi_notify = NULL;
+		if (au_hnotify_op.free(hinode, hn))
+			au_cache_free_hnotify(hn);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_hn_ctl(struct au_hinode *hinode, int do_set)
+{
+	if (hinode->hi_notify)
+		au_hnotify_op.ctl(hinode, do_set);
+}
+
+void au_hn_reset(struct inode *inode, unsigned int flags)
+{
+	aufs_bindex_t bindex, bend;
+	struct inode *hi;
+	struct dentry *iwhdentry;
+
+	bend = au_ibend(inode);
+	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
+		hi = au_h_iptr(inode, bindex);
+		if (!hi)
+			continue;
+
+		/* inode_lock_nested(hi, AuLsc_I_CHILD); */
+		iwhdentry = au_hi_wh(inode, bindex);
+		if (iwhdentry)
+			dget(iwhdentry);
+		au_igrab(hi);
+		au_set_h_iptr(inode, bindex, NULL, 0);
+		au_set_h_iptr(inode, bindex, au_igrab(hi),
+			      flags & ~AuHi_XINO);
+		iput(hi);
+		dput(iwhdentry);
+		/* inode_unlock(hi); */
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int hn_xino(struct inode *inode, struct inode *h_inode)
+{
+	int err;
+	aufs_bindex_t bindex, bend, bfound, bstart;
+	struct inode *h_i;
+
+	err = 0;
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("branch root dir was changed\n");
+		goto out;
+	}
+
+	bfound = -1;
+	bend = au_ibend(inode);
+	bstart = au_ibstart(inode);
+#if 0 /* reserved for future use */
+	if (bindex == bend) {
+		/* keep this ino in rename case */
+		goto out;
+	}
+#endif
+	for (bindex = bstart; bindex <= bend; bindex++)
+		if (au_h_iptr(inode, bindex) == h_inode) {
+			bfound = bindex;
+			break;
+		}
+	if (bfound < 0)
+		goto out;
+
+	for (bindex = bstart; bindex <= bend; bindex++) {
+		h_i = au_h_iptr(inode, bindex);
+		if (!h_i)
+			continue;
+
+		err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0);
+		/* ignore this error */
+		/* bad action? */
+	}
+
+	/* the children's inode numbers will be broken */
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int hn_gen_tree(struct dentry *dentry)
+{
+	int err, i, j, ndentry;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, dentry, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			struct dentry *d;
+
+			d = dentries[j];
+			if (IS_ROOT(d))
+				continue;
+
+			au_digen_dec(d);
+			if (d_really_is_positive(d))
+				/* todo: reset children xino?
+				   cached children only? */
+				au_iigen_dec(d_inode(d));
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+
+#if 0
+	/* discard children */
+	dentry_unhash(dentry);
+	dput(dentry);
+#endif
+out:
+	return err;
+}
+
+/*
+ * return 0 if processed.
+ */
+static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode,
+			   const unsigned int isdir)
+{
+	int err;
+	struct dentry *d;
+	struct qstr *dname;
+
+	err = 1;
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("branch root dir was changed\n");
+		err = 0;
+		goto out;
+	}
+
+	if (!isdir) {
+		AuDebugOn(!name);
+		au_iigen_dec(inode);
+		spin_lock(&inode->i_lock);
+		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
+			spin_lock(&d->d_lock);
+			dname = &d->d_name;
+			if (dname->len != nlen
+			    && memcmp(dname->name, name, nlen)) {
+				spin_unlock(&d->d_lock);
+				continue;
+			}
+			err = 0;
+			au_digen_dec(d);
+			spin_unlock(&d->d_lock);
+			break;
+		}
+		spin_unlock(&inode->i_lock);
+	} else {
+		au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR);
+		d = d_find_any_alias(inode);
+		if (!d) {
+			au_iigen_dec(inode);
+			goto out;
+		}
+
+		spin_lock(&d->d_lock);
+		dname = &d->d_name;
+		if (dname->len == nlen && !memcmp(dname->name, name, nlen)) {
+			spin_unlock(&d->d_lock);
+			err = hn_gen_tree(d);
+			spin_lock(&d->d_lock);
+		}
+		spin_unlock(&d->d_lock);
+		dput(d);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir)
+{
+	int err;
+
+	if (IS_ROOT(dentry)) {
+		pr_warn("branch root dir was changed\n");
+		return 0;
+	}
+
+	err = 0;
+	if (!isdir) {
+		au_digen_dec(dentry);
+		if (d_really_is_positive(dentry))
+			au_iigen_dec(d_inode(dentry));
+	} else {
+		au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR);
+		if (d_really_is_positive(dentry))
+			err = hn_gen_tree(dentry);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* hnotify job flags */
+#define AuHnJob_XINO0		1
+#define AuHnJob_GEN		(1 << 1)
+#define AuHnJob_DIRENT		(1 << 2)
+#define AuHnJob_ISDIR		(1 << 3)
+#define AuHnJob_TRYXINO0	(1 << 4)
+#define AuHnJob_MNTPNT		(1 << 5)
+#define au_ftest_hnjob(flags, name)	((flags) & AuHnJob_##name)
+#define au_fset_hnjob(flags, name) \
+	do { (flags) |= AuHnJob_##name; } while (0)
+#define au_fclr_hnjob(flags, name) \
+	do { (flags) &= ~AuHnJob_##name; } while (0)
+
+enum {
+	AuHn_CHILD,
+	AuHn_PARENT,
+	AuHnLast
+};
+
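+/*
+ * Argument block handed to the workqueue handler au_hn_bh(). The child name
+ * is stored in the flexible array member h_child_name[]; au_hnotify()
+ * allocates the whole block (struct size + name length + 1) with a single
+ * kmalloc() and au_hn_bh() releases it together with the held inodes.
+ */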
+struct au_hnotify_args {
+	struct inode *h_dir, *dir, *h_child_inode;
+	u32 mask;
+	unsigned int flags[AuHnLast];
+	unsigned int h_child_nlen;
+	char h_child_name[];
+};
+
+struct hn_job_args {
+	unsigned int flags;
+	struct inode *inode, *h_inode, *dir, *h_dir;
+	struct dentry *dentry;
+	char *h_name;
+	int h_nlen;
+};
+
+static int hn_job(struct hn_job_args *a)
+{
+	const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR);
+	int e;
+
+	/* reset xino */
+	if (au_ftest_hnjob(a->flags, XINO0) && a->inode)
+		hn_xino(a->inode, a->h_inode); /* ignore this error */
+
+	if (au_ftest_hnjob(a->flags, TRYXINO0)
+	    && a->inode
+	    && a->h_inode) {
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+		if (!a->h_inode->i_nlink
+		    && !(a->h_inode->i_state & I_LINKABLE))
+			hn_xino(a->inode, a->h_inode); /* ignore this error */
+		inode_unlock(a->h_inode);
+	}
+
+	/* make the generation obsolete */
+	if (au_ftest_hnjob(a->flags, GEN)) {
+		e = -1;
+		if (a->inode)
+			e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode,
+					      isdir);
+		if (e && a->dentry)
+			hn_gen_by_name(a->dentry, isdir);
+		/* ignore this error */
+	}
+
+	/* make dir entries obsolete */
+	if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) {
+		struct au_vdir *vdir;
+
+		vdir = au_ivdir(a->inode);
+		if (vdir)
+			vdir->vd_jiffy = 0;
+		/* IMustLock(a->inode); */
+		/* a->inode->i_version++; */
+	}
+
+	/* can do nothing but warn */
+	if (au_ftest_hnjob(a->flags, MNTPNT)
+	    && a->dentry
+	    && d_mountpoint(a->dentry))
+		pr_warn("mount-point %pd is removed or renamed\n", a->dentry);
+
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
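+/*
+ * Search the children of @dir for a dentry whose name matches @name/@nlen,
+ * marking the generation of matching dentries obsolete along the way. The
+ * first still-referenced match is returned with its di_rwsem write-locked;
+ * otherwise NULL is returned.
+ */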
+static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen,
+					   struct inode *dir)
+{
+	struct dentry *dentry, *d, *parent;
+	struct qstr *dname;
+
+	parent = d_find_any_alias(dir);
+	if (!parent)
+		return NULL;
+
+	dentry = NULL;
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(d, &parent->d_subdirs, d_child) {
+		/* AuDbg("%pd\n", d); */
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+		dname = &d->d_name;
+		if (dname->len != nlen || memcmp(dname->name, name, nlen))
+			goto cont_unlock;
+		if (au_di(d))
+			au_digen_dec(d);
+		else
+			goto cont_unlock;
+		if (au_dcount(d) > 0) {
+			dentry = dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			break;
+		}
+
+cont_unlock:
+		spin_unlock(&d->d_lock);
+	}
+	spin_unlock(&parent->d_lock);
+	dput(parent);
+
+	if (dentry)
+		di_write_lock_child(dentry);
+
+	return dentry;
+}
+
+static struct inode *lookup_wlock_by_ino(struct super_block *sb,
+					 aufs_bindex_t bindex, ino_t h_ino)
+{
+	struct inode *inode;
+	ino_t ino;
+	int err;
+
+	inode = NULL;
+	err = au_xino_read(sb, bindex, h_ino, &ino);
+	if (!err && ino)
+		inode = ilookup(sb, ino);
+	if (!inode)
+		goto out;
+
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("wrong root branch\n");
+		iput(inode);
+		inode = NULL;
+		goto out;
+	}
+
+	ii_write_lock_child(inode);
+
+out:
+	return inode;
+}
+
+static void au_hn_bh(void *_args)
+{
+	struct au_hnotify_args *a = _args;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bend, bfound;
+	unsigned char xino, try_iput;
+	int err;
+	struct inode *inode;
+	ino_t h_ino;
+	struct hn_job_args args;
+	struct dentry *dentry;
+	struct au_sbinfo *sbinfo;
+
+	AuDebugOn(!_args);
+	AuDebugOn(!a->h_dir);
+	AuDebugOn(!a->dir);
+	AuDebugOn(!a->mask);
+	AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n",
+	      a->mask, a->dir->i_ino, a->h_dir->i_ino,
+	      a->h_child_inode ? a->h_child_inode->i_ino : 0);
+
+	inode = NULL;
+	dentry = NULL;
+	/*
+	 * do not lock a->dir->i_mutex here,
+	 * because d_revalidate() may cause a deadlock.
+	 */
+	sb = a->dir->i_sb;
+	AuDebugOn(!sb);
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!sbinfo);
+	si_write_lock(sb, AuLock_NOPLMW);
+
+	ii_read_lock_parent(a->dir);
+	bfound = -1;
+	bend = au_ibend(a->dir);
+	for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++)
+		if (au_h_iptr(a->dir, bindex) == a->h_dir) {
+			bfound = bindex;
+			break;
+		}
+	ii_read_unlock(a->dir);
+	if (unlikely(bfound < 0))
+		goto out;
+
+	xino = !!au_opt_test(au_mntflags(sb), XINO);
+	h_ino = 0;
+	if (a->h_child_inode)
+		h_ino = a->h_child_inode->i_ino;
+
+	if (a->h_child_nlen
+	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT)))
+		dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen,
+					      a->dir);
+	try_iput = 0;
+	if (dentry && d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (xino && !inode && h_ino
+	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) {
+		inode = lookup_wlock_by_ino(sb, bfound, h_ino);
+		try_iput = 1;
+	}
+
+	args.flags = a->flags[AuHn_CHILD];
+	args.dentry = dentry;
+	args.inode = inode;
+	args.h_inode = a->h_child_inode;
+	args.dir = a->dir;
+	args.h_dir = a->h_dir;
+	args.h_name = a->h_child_name;
+	args.h_nlen = a->h_child_nlen;
+	err = hn_job(&args);
+	if (dentry) {
+		if (au_di(dentry))
+			di_write_unlock(dentry);
+		dput(dentry);
+	}
+	if (inode && try_iput) {
+		ii_write_unlock(inode);
+		iput(inode);
+	}
+
+	ii_write_lock_parent(a->dir);
+	args.flags = a->flags[AuHn_PARENT];
+	args.dentry = NULL;
+	args.inode = a->dir;
+	args.h_inode = a->h_dir;
+	args.dir = NULL;
+	args.h_dir = NULL;
+	args.h_name = NULL;
+	args.h_nlen = 0;
+	err = hn_job(&args);
+	ii_write_unlock(a->dir);
+
+out:
+	iput(a->h_child_inode);
+	iput(a->h_dir);
+	iput(a->dir);
+	si_write_unlock(sb);
+	au_nwt_done(&sbinfo->si_nowait);
+	kfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
+	       struct qstr *h_child_qstr, struct inode *h_child_inode)
+{
+	int err, len;
+	unsigned int flags[AuHnLast], f;
+	unsigned char isdir, isroot, wh;
+	struct inode *dir;
+	struct au_hnotify_args *args;
+	char *p, *h_child_name;
+
+	err = 0;
+	AuDebugOn(!hnotify || !hnotify->hn_aufs_inode);
+	dir = igrab(hnotify->hn_aufs_inode);
+	if (!dir)
+		goto out;
+
+	isroot = (dir->i_ino == AUFS_ROOT_INO);
+	wh = 0;
+	h_child_name = (void *)h_child_qstr->name;
+	len = h_child_qstr->len;
+	if (h_child_name) {
+		if (len > AUFS_WH_PFX_LEN
+		    && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+			h_child_name += AUFS_WH_PFX_LEN;
+			len -= AUFS_WH_PFX_LEN;
+			wh = 1;
+		}
+	}
+
+	isdir = 0;
+	if (h_child_inode)
+		isdir = !!S_ISDIR(h_child_inode->i_mode);
+	flags[AuHn_PARENT] = AuHnJob_ISDIR;
+	flags[AuHn_CHILD] = 0;
+	if (isdir)
+		flags[AuHn_CHILD] = AuHnJob_ISDIR;
+	au_fset_hnjob(flags[AuHn_PARENT], DIRENT);
+	au_fset_hnjob(flags[AuHn_CHILD], GEN);
+	switch (mask & FS_EVENTS_POSS_ON_CHILD) {
+	case FS_MOVED_FROM:
+	case FS_MOVED_TO:
+		au_fset_hnjob(flags[AuHn_CHILD], XINO0);
+		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
+		/*FALLTHROUGH*/
+	case FS_CREATE:
+		AuDebugOn(!h_child_name);
+		break;
+
+	case FS_DELETE:
+		/*
+		 * aufs will never be able to get this child inode.
+		 * revalidation should be done in d_revalidate()
+		 * by checking i_nlink, i_generation or d_unhashed().
+		 */
+		AuDebugOn(!h_child_name);
+		au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0);
+		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
+		break;
+
+	default:
+		AuDebugOn(1);
+	}
+
+	if (wh)
+		h_child_inode = NULL;
+
+	err = -ENOMEM;
+	/* iput() and kfree() will be called in au_hn_bh(), or below on error */
+	args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS);
+	if (unlikely(!args)) {
+		AuErr1("no memory\n");
+		iput(dir);
+		goto out;
+	}
+	args->flags[AuHn_PARENT] = flags[AuHn_PARENT];
+	args->flags[AuHn_CHILD] = flags[AuHn_CHILD];
+	args->mask = mask;
+	args->dir = dir;
+	args->h_dir = igrab(h_dir);
+	if (h_child_inode)
+		h_child_inode = igrab(h_child_inode); /* can be NULL */
+	args->h_child_inode = h_child_inode;
+	args->h_child_nlen = len;
+	if (len) {
+		p = (void *)args;
+		p += sizeof(*args);
+		memcpy(p, h_child_name, len);
+		p[len] = 0;
+	}
+
+	/* NFS fires the event for a silly-renamed file from a kworker */
+	f = 0;
+	if (!dir->i_nlink
+	    || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE)))
+		f = AuWkq_NEST;
+	err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f);
+	if (unlikely(err)) {
+		pr_err("wkq %d\n", err);
+		iput(args->h_child_inode);
+		iput(args->h_dir);
+		iput(args->dir);
+		kfree(args);
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm)
+{
+	int err;
+
+	AuDebugOn(!(udba & AuOptMask_UDBA));
+
+	err = 0;
+	if (au_hnotify_op.reset_br)
+		err = au_hnotify_op.reset_br(udba, br, perm);
+
+	return err;
+}
+
+int au_hnotify_init_br(struct au_branch *br, int perm)
+{
+	int err;
+
+	err = 0;
+	if (au_hnotify_op.init_br)
+		err = au_hnotify_op.init_br(br, perm);
+
+	return err;
+}
+
+void au_hnotify_fin_br(struct au_branch *br)
+{
+	if (au_hnotify_op.fin_br)
+		au_hnotify_op.fin_br(br);
+}
+
+static void au_hn_destroy_cache(void)
+{
+	kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]);
+	au_cachep[AuCache_HNOTIFY] = NULL;
+}
+
+int __init au_hnotify_init(void)
+{
+	int err;
+
+	err = -ENOMEM;
+	au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify);
+	if (au_cachep[AuCache_HNOTIFY]) {
+		err = 0;
+		if (au_hnotify_op.init)
+			err = au_hnotify_op.init();
+		if (unlikely(err))
+			au_hn_destroy_cache();
+	}
+	AuTraceErr(err);
+	return err;
+}
+
+void au_hnotify_fin(void)
+{
+	if (au_hnotify_op.fin)
+		au_hnotify_op.fin();
+	/* cf. au_cache_fin() */
+	if (au_cachep[AuCache_HNOTIFY])
+		au_hn_destroy_cache();
+}
diff --git b/fs/aufs/i_op.c b/fs/aufs/i_op.c
new file mode 100644
index 0000000..a70d5b0
--- /dev/null
+++ b/fs/aufs/i_op.c
@@ -0,0 +1,1406 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (except add/del/rename)
+ */
+
+#include <linux/device_cgroup.h>
+#include <linux/fs_stack.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include "aufs.h"
+
+static int h_permission(struct inode *h_inode, int mask,
+			struct vfsmount *h_mnt, int brperm)
+{
+	int err;
+	const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
+
+	err = -EACCES;
+	if ((write_mask && IS_IMMUTABLE(h_inode))
+	    || ((mask & MAY_EXEC)
+		&& S_ISREG(h_inode->i_mode)
+		&& ((h_mnt->mnt_flags & MNT_NOEXEC)
+		    || !(h_inode->i_mode & S_IXUGO))))
+		goto out;
+
+	/*
+	 * - skip the lower fs test in the case of a write to a ro branch.
+	 * - the nfs dir permission write check is optimized, but a policy for
+	 *   link/rename requires a real check.
+	 * - nfs always sets MS_POSIXACL regardless of its mount option 'noacl';
+	 *   in this case, generic_permission() returns -EOPNOTSUPP.
+	 */
+	if ((write_mask && !au_br_writable(brperm))
+	    || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode)
+		&& write_mask && !(mask & MAY_READ))
+	    || !h_inode->i_op->permission) {
+		/* AuLabel(generic_permission); */
+		/* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */
+		err = generic_permission(h_inode, mask);
+		if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode))
+			err = h_inode->i_op->permission(h_inode, mask);
+		AuTraceErr(err);
+	} else {
+		/* AuLabel(h_inode->permission); */
+		err = h_inode->i_op->permission(h_inode, mask);
+		AuTraceErr(err);
+	}
+
+	if (!err)
+		err = devcgroup_inode_permission(h_inode, mask);
+	if (!err)
+		err = security_inode_permission(h_inode, mask);
+
+#if 0
+	if (!err) {
+		/* todo: do we need to call ima_path_check()? */
+		struct path h_path = {
+			.dentry	=
+			.mnt	= h_mnt
+		};
+		err = ima_path_check(&h_path,
+				     mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
+				     IMA_COUNT_LEAVE);
+	}
+#endif
+
+out:
+	return err;
+}
+
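+/*
+ * For a non-directory, for any write request, or when the DIRPERM1 option is
+ * set, only the top branch is consulted (plus a check that some writable
+ * branch exists when writing). A plain read on a directory is checked on
+ * every lower directory instead, so that a denial on any branch is honoured.
+ */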
+static int aufs_permission(struct inode *inode, int mask)
+{
+	int err;
+	aufs_bindex_t bindex, bend;
+	const unsigned char isdir = !!S_ISDIR(inode->i_mode),
+		write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+
+	/* todo: support rcu-walk? */
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_read_lock_child(inode);
+#if 0
+	err = au_iigen_test(inode, au_sigen(sb));
+	if (unlikely(err))
+		goto out;
+#endif
+
+	if (!isdir
+	    || write_mask
+	    || au_opt_test(au_mntflags(sb), DIRPERM1)) {
+		err = au_busy_or_stale();
+		h_inode = au_h_iptr(inode, au_ibstart(inode));
+		if (unlikely(!h_inode
+			     || (h_inode->i_mode & S_IFMT)
+			     != (inode->i_mode & S_IFMT)))
+			goto out;
+
+		err = 0;
+		bindex = au_ibstart(inode);
+		br = au_sbr(sb, bindex);
+		err = h_permission(h_inode, mask, au_br_mnt(br), br->br_perm);
+		if (write_mask
+		    && !err
+		    && !special_file(h_inode->i_mode)) {
+			/* test whether the upper writable branch exists */
+			err = -EROFS;
+			for (; bindex >= 0; bindex--)
+				if (!au_br_rdonly(au_sbr(sb, bindex))) {
+					err = 0;
+					break;
+				}
+		}
+		goto out;
+	}
+
+	/* non-write to dir */
+	err = 0;
+	bend = au_ibend(inode);
+	for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) {
+		h_inode = au_h_iptr(inode, bindex);
+		if (h_inode) {
+			err = au_busy_or_stale();
+			if (unlikely(!S_ISDIR(h_inode->i_mode)))
+				break;
+
+			br = au_sbr(sb, bindex);
+			err = h_permission(h_inode, mask, au_br_mnt(br),
+					   br->br_perm);
+		}
+	}
+
+out:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct dentry *ret, *parent;
+	struct inode *inode;
+	struct super_block *sb;
+	int err, npositive;
+
+	IMustLock(dir);
+
+	/* todo: support rcu-walk? */
+	ret = ERR_PTR(-ECHILD);
+	if (flags & LOOKUP_RCU)
+		goto out;
+
+	ret = ERR_PTR(-ENAMETOOLONG);
+	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		goto out;
+
+	sb = dir->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	ret = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	err = au_di_init(dentry);
+	ret = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_si;
+
+	inode = NULL;
+	npositive = 0; /* suppress a warning */
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_read_lock_parent(parent, AuLock_IR);
+	err = au_alive_dir(parent);
+	if (!err)
+		err = au_digen_test(parent, au_sigen(sb));
+	if (!err) {
+		npositive = au_lkup_dentry(dentry, au_dbstart(parent),
+					   /*type*/0);
+		err = npositive;
+	}
+	di_read_unlock(parent, AuLock_IR);
+	ret = ERR_PTR(err);
+	if (unlikely(err < 0))
+		goto out_unlock;
+
+	if (npositive) {
+		inode = au_new_inode(dentry, /*must_new*/0);
+		if (IS_ERR(inode)) {
+			ret = (void *)inode;
+			inode = NULL;
+			goto out_unlock;
+		}
+	}
+
+	if (inode)
+		atomic_inc(&inode->i_count);
+	ret = d_splice_alias(inode, dentry);
+#if 0
+	if (unlikely(d_need_lookup(dentry))) {
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
+		spin_unlock(&dentry->d_lock);
+	} else
+#endif
+	if (inode) {
+		if (!IS_ERR(ret)) {
+			iput(inode);
+			if (ret && ret != dentry)
+				ii_write_unlock(inode);
+		} else {
+			ii_write_unlock(inode);
+			iput(inode);
+			inode = NULL;
+		}
+	}
+
+out_unlock:
+	di_write_unlock(dentry);
+	if (inode) {
+		/* verbose coding for lock class name */
+		if (unlikely(S_ISLNK(inode->i_mode)))
+			au_rw_class(&au_di(dentry)->di_rwsem,
+				    au_lc_key + AuLcSymlink_DIINFO);
+		else if (unlikely(S_ISDIR(inode->i_mode)))
+			au_rw_class(&au_di(dentry)->di_rwsem,
+				    au_lc_key + AuLcDir_DIINFO);
+		else /* likely */
+			au_rw_class(&au_di(dentry)->di_rwsem,
+				    au_lc_key + AuLcNonDir_DIINFO);
+	}
+out_si:
+	si_read_unlock(sb);
+out:
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct aopen_node {
+	struct hlist_node hlist;
+	struct file *file, *h_file;
+};
+
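+/*
+ * the open callback passed to finish_open().
+ * look up the lower file which aufs_atomic_open() registered in si_aopen,
+ * and finish opening the aufs file with it.
+ */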
+static int au_do_aopen(struct inode *inode, struct file *file)
+{
+	struct au_sphlhead *aopen;
+	struct aopen_node *node;
+	struct au_do_open_args args = {
+		.no_lock	= 1,
+		.open		= au_do_open_nondir
+	};
+
+	aopen = &au_sbi(inode->i_sb)->si_aopen;
+	spin_lock(&aopen->spin);
+	hlist_for_each_entry(node, &aopen->head, hlist)
+		if (node->file == file) {
+			args.h_file = node->h_file;
+			break;
+		}
+	spin_unlock(&aopen->spin);
+	/* AuDebugOn(!args.h_file); */
+
+	return au_do_open(file, &args);
+}
+
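+/*
+ * ->atomic_open() for aufs dirs.
+ * when no creation is necessary, fall back to finish_no_open(). otherwise
+ * create the entry on a writable branch via au_aopen_or_create(), register
+ * the pre-opened lower file in si_aopen, and let finish_open() call
+ * au_do_aopen() to open the aufs file with it.
+ */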
+static int aufs_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, unsigned int open_flag,
+			    umode_t create_mode, int *opened)
+{
+	int err, h_opened = *opened;
+	struct dentry *parent;
+	struct dentry *d;
+	struct au_sphlhead *aopen;
+	struct vfsub_aopen_args args = {
+		.open_flag	= open_flag,
+		.create_mode	= create_mode,
+		.opened		= &h_opened
+	};
+	struct aopen_node aopen_node = {
+		.file	= file
+	};
+
+	IMustLock(dir);
+	AuDbg("open_flag 0x%x\n", open_flag);
+	AuDbgDentry(dentry);
+
+	err = 0;
+	if (!au_di(dentry)) {
+		d = aufs_lookup(dir, dentry, /*flags*/0);
+		if (IS_ERR(d)) {
+			err = PTR_ERR(d);
+			goto out;
+		} else if (d) {
+			/*
+			 * an obsolete dentry was found.
+			 * another error will be returned later.
+			 */
+			d_drop(d);
+			dput(d);
+			AuDbgDentry(d);
+		}
+		AuDbgDentry(dentry);
+	}
+
+	if (d_is_positive(dentry)
+	    || d_unhashed(dentry)
+	    || d_unlinked(dentry)
+	    || !(open_flag & O_CREAT))
+		goto out_no_open;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
+	if (unlikely(err))
+		goto out;
+
+	parent = dentry->d_parent;	/* dir is locked */
+	di_write_lock_parent(parent);
+	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
+	if (unlikely(err))
+		goto out_unlock;
+
+	AuDbgDentry(dentry);
+	if (d_is_positive(dentry))
+		goto out_unlock;
+
+	args.file = get_empty_filp();
+	err = PTR_ERR(args.file);
+	if (IS_ERR(args.file))
+		goto out_unlock;
+
+	args.file->f_flags = file->f_flags;
+	err = au_aopen_or_create(dir, dentry, &args);
+	AuTraceErr(err);
+	AuDbgFile(args.file);
+	if (unlikely(err < 0)) {
+		if (h_opened & FILE_OPENED)
+			fput(args.file);
+		else
+			put_filp(args.file);
+		goto out_unlock;
+	}
+
+	/* some filesystems don't set FILE_CREATED even when they succeed? */
+	*opened |= FILE_CREATED;
+	if (h_opened & FILE_OPENED)
+		aopen_node.h_file = args.file;
+	else {
+		put_filp(args.file);
+		args.file = NULL;
+	}
+	aopen = &au_sbi(dir->i_sb)->si_aopen;
+	au_sphl_add(&aopen_node.hlist, aopen);
+	err = finish_open(file, dentry, au_do_aopen, opened);
+	au_sphl_del(&aopen_node.hlist, aopen);
+	AuTraceErr(err);
+	AuDbgFile(file);
+	if (aopen_node.h_file)
+		fput(aopen_node.h_file);
+
+out_unlock:
+	di_write_unlock(parent);
+	aufs_read_unlock(dentry, AuLock_DW);
+	AuDbgDentry(dentry);
+	if (unlikely(err))
+		goto out;
+out_no_open:
+	if (!err && !(*opened & FILE_CREATED)) {
+		AuLabel(out_no_open);
+		dget(dentry);
+		err = finish_no_open(file, dentry);
+	}
+out:
+	AuDbg("%pd%s%s\n", dentry,
+	      (*opened & FILE_CREATED) ? " created" : "",
+	      (*opened & FILE_OPENED) ? " opened" : "");
+	AuTraceErr(err);
+	return err;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
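+/*
+ * prepare the parent dir on the branch @bcpup for creating a new entry:
+ * copy up (or copy down) the missing parent dirs, and, except for tmpfile,
+ * make a negative lower dentry for @dentry there.
+ */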
+static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent,
+			  const unsigned char add_entry, aufs_bindex_t bcpup,
+			  aufs_bindex_t bstart)
+{
+	int err;
+	struct dentry *h_parent;
+	struct inode *h_dir;
+
+	if (add_entry)
+		IMustLock(d_inode(parent));
+	else
+		di_write_lock_parent(parent);
+
+	err = 0;
+	if (!au_h_dptr(parent, bcpup)) {
+		if (bstart > bcpup)
+			err = au_cpup_dirs(dentry, bcpup);
+		else if (bstart < bcpup)
+			err = au_cpdown_dirs(dentry, bcpup);
+		else
+			BUG();
+	}
+	if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) {
+		h_parent = au_h_dptr(parent, bcpup);
+		h_dir = d_inode(h_parent);
+		inode_lock_nested(h_dir, AuLsc_I_PARENT);
+		err = au_lkup_neg(dentry, bcpup, /*wh*/0);
+		/* todo: no unlock here */
+		inode_unlock(h_dir);
+
+		AuDbg("bcpup %d\n", bcpup);
+		if (!err) {
+			if (d_really_is_negative(dentry))
+				au_set_h_dptr(dentry, bstart, NULL);
+			au_update_dbrange(dentry, /*do_put_zero*/0);
+		}
+	}
+
+	if (!add_entry)
+		di_write_unlock(parent);
+	if (!err)
+		err = bcpup; /* success */
+
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * decide the branch and the parent dir where we will create a new entry.
+ * returns the new bindex or an error.
+ * copies up the parent dir if needed.
+ */
+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
+	      struct au_wr_dir_args *args)
+{
+	int err;
+	unsigned int flags;
+	aufs_bindex_t bcpup, bstart, src_bstart;
+	const unsigned char add_entry
+		= au_ftest_wrdir(args->flags, ADD_ENTRY)
+		| au_ftest_wrdir(args->flags, TMPFILE);
+	struct super_block *sb;
+	struct dentry *parent;
+	struct au_sbinfo *sbinfo;
+
+	sb = dentry->d_sb;
+	sbinfo = au_sbi(sb);
+	parent = dget_parent(dentry);
+	bstart = au_dbstart(dentry);
+	bcpup = bstart;
+	if (args->force_btgt < 0) {
+		if (src_dentry) {
+			src_bstart = au_dbstart(src_dentry);
+			if (src_bstart < bstart)
+				bcpup = src_bstart;
+		} else if (add_entry) {
+			flags = 0;
+			if (au_ftest_wrdir(args->flags, ISDIR))
+				au_fset_wbr(flags, DIR);
+			err = AuWbrCreate(sbinfo, dentry, flags);
+			bcpup = err;
+		}
+
+		if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) {
+			if (add_entry)
+				err = AuWbrCopyup(sbinfo, dentry);
+			else {
+				if (!IS_ROOT(dentry)) {
+					di_read_lock_parent(parent, !AuLock_IR);
+					err = AuWbrCopyup(sbinfo, dentry);
+					di_read_unlock(parent, !AuLock_IR);
+				} else
+					err = AuWbrCopyup(sbinfo, dentry);
+			}
+			bcpup = err;
+			if (unlikely(err < 0))
+				goto out;
+		}
+	} else {
+		bcpup = args->force_btgt;
+		AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry)));
+	}
+
+	AuDbg("bstart %d, bcpup %d\n", bstart, bcpup);
+	err = bcpup;
+	if (bcpup == bstart)
+		goto out; /* success */
+
+	/* copy up the new parent into the branch we are processing */
+	err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart);
+	if (err >= 0) {
+		if (d_really_is_negative(dentry)) {
+			au_set_h_dptr(dentry, bstart, NULL);
+			au_set_dbstart(dentry, bcpup);
+			au_set_dbend(dentry, bcpup);
+		}
+		AuDebugOn(add_entry
+			  && !au_ftest_wrdir(args->flags, TMPFILE)
+			  && !au_h_dptr(dentry, bcpup));
+	}
+
+out:
+	dput(parent);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_pin_hdir_unlock(struct au_pin *p)
+{
+	if (p->hdir)
+		au_hn_imtx_unlock(p->hdir);
+}
+
+int au_pin_hdir_lock(struct au_pin *p)
+{
+	int err;
+
+	err = 0;
+	if (!p->hdir)
+		goto out;
+
+	/* even if an error happens later, keep this lock */
+	au_hn_imtx_lock_nested(p->hdir, p->lsc_hi);
+
+	err = -EBUSY;
+	if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent)))
+		goto out;
+
+	err = 0;
+	if (p->h_dentry)
+		err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode,
+				  p->h_parent, p->br);
+
+out:
+	return err;
+}
+
+int au_pin_hdir_relock(struct au_pin *p)
+{
+	int err, i;
+	struct inode *h_i;
+	struct dentry *h_d[] = {
+		p->h_dentry,
+		p->h_parent
+	};
+
+	err = au_pin_hdir_lock(p);
+	if (unlikely(err))
+		goto out;
+
+	for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) {
+		if (!h_d[i])
+			continue;
+		if (d_is_positive(h_d[i])) {
+			h_i = d_inode(h_d[i]);
+			err = !h_i->i_nlink;
+		}
+	}
+
+out:
+	return err;
+}
+
+void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task)
+{
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
+	p->hdir->hi_inode->i_mutex.owner = task;
+#endif
+}
+
+void au_pin_hdir_acquire_nest(struct au_pin *p)
+{
+	if (p->hdir) {
+		mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map,
+				   p->lsc_hi, 0, NULL, _RET_IP_);
+		au_pin_hdir_set_owner(p, current);
+	}
+}
+
+void au_pin_hdir_release(struct au_pin *p)
+{
+	if (p->hdir) {
+		au_pin_hdir_set_owner(p, p->task);
+		mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_);
+	}
+}
+
+struct dentry *au_pinned_h_parent(struct au_pin *pin)
+{
+	if (pin && pin->parent)
+		return au_h_dptr(pin->parent, pin->bindex);
+	return NULL;
+}
+
+void au_unpin(struct au_pin *p)
+{
+	if (p->hdir)
+		au_pin_hdir_unlock(p);
+	if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE))
+		vfsub_mnt_drop_write(p->h_mnt);
+	if (!p->hdir)
+		return;
+
+	if (!au_ftest_pin(p->flags, DI_LOCKED))
+		di_read_unlock(p->parent, AuLock_IR);
+	iput(p->hdir->hi_inode);
+	dput(p->parent);
+	p->parent = NULL;
+	p->hdir = NULL;
+	p->h_mnt = NULL;
+	/* do not clear p->task */
+}
+
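+/*
+ * pin the lower parent dir of p->dentry on the branch p->bindex so that it
+ * cannot change while we operate under it: grab its dentry and inode, take
+ * mnt_want_write() when MNT_WRITE is requested, and lock the lower dir via
+ * au_pin_hdir_lock(). for the root dentry only mnt_want_write() is taken.
+ */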
+int au_do_pin(struct au_pin *p)
+{
+	int err;
+	struct super_block *sb;
+	struct inode *h_dir;
+
+	err = 0;
+	sb = p->dentry->d_sb;
+	p->br = au_sbr(sb, p->bindex);
+	if (IS_ROOT(p->dentry)) {
+		if (au_ftest_pin(p->flags, MNT_WRITE)) {
+			p->h_mnt = au_br_mnt(p->br);
+			err = vfsub_mnt_want_write(p->h_mnt);
+			if (unlikely(err)) {
+				au_fclr_pin(p->flags, MNT_WRITE);
+				goto out_err;
+			}
+		}
+		goto out;
+	}
+
+	p->h_dentry = NULL;
+	if (p->bindex <= au_dbend(p->dentry))
+		p->h_dentry = au_h_dptr(p->dentry, p->bindex);
+
+	p->parent = dget_parent(p->dentry);
+	if (!au_ftest_pin(p->flags, DI_LOCKED))
+		di_read_lock(p->parent, AuLock_IR, p->lsc_di);
+
+	h_dir = NULL;
+	p->h_parent = au_h_dptr(p->parent, p->bindex);
+	p->hdir = au_hi(d_inode(p->parent), p->bindex);
+	if (p->hdir)
+		h_dir = p->hdir->hi_inode;
+
+	/*
+	 * in the udba case, or when DI_LOCKED is not set,
+	 * p->parent may be different and h_parent can be NULL.
+	 */
+	if (unlikely(!p->hdir || !h_dir || !p->h_parent)) {
+		err = -EBUSY;
+		if (!au_ftest_pin(p->flags, DI_LOCKED))
+			di_read_unlock(p->parent, AuLock_IR);
+		dput(p->parent);
+		p->parent = NULL;
+		goto out_err;
+	}
+
+	if (au_ftest_pin(p->flags, MNT_WRITE)) {
+		p->h_mnt = au_br_mnt(p->br);
+		err = vfsub_mnt_want_write(p->h_mnt);
+		if (unlikely(err)) {
+			au_fclr_pin(p->flags, MNT_WRITE);
+			if (!au_ftest_pin(p->flags, DI_LOCKED))
+				di_read_unlock(p->parent, AuLock_IR);
+			dput(p->parent);
+			p->parent = NULL;
+			goto out_err;
+		}
+	}
+
+	au_igrab(h_dir);
+	err = au_pin_hdir_lock(p);
+	if (!err)
+		goto out; /* success */
+
+	au_unpin(p);
+
+out_err:
+	pr_err("err %d\n", err);
+	err = au_busy_or_stale();
+out:
+	return err;
+}
+
+void au_pin_init(struct au_pin *p, struct dentry *dentry,
+		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
+		 unsigned int udba, unsigned char flags)
+{
+	p->dentry = dentry;
+	p->udba = udba;
+	p->lsc_di = lsc_di;
+	p->lsc_hi = lsc_hi;
+	p->flags = flags;
+	p->bindex = bindex;
+
+	p->parent = NULL;
+	p->hdir = NULL;
+	p->h_mnt = NULL;
+
+	p->h_dentry = NULL;
+	p->h_parent = NULL;
+	p->br = NULL;
+	p->task = current;
+}
+
+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
+	   unsigned int udba, unsigned char flags)
+{
+	au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2,
+		    udba, flags);
+	return au_do_pin(pin);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * ->setattr() and ->getattr() are called in various cases.
+ * chmod, stat: dentry is revalidated.
+ * fchmod, fstat: file and dentry are not revalidated, additionally they may be
+ *		  unhashed.
+ * for ->setattr(), ia->ia_file is passed from ftruncate only.
+ */
+/* todo: consolidate with do_refresh() and simple_reval_dpath() */
+int au_reval_for_attr(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *parent;
+
+	err = 0;
+	if (au_digen_test(dentry, sigen)) {
+		parent = dget_parent(dentry);
+		di_read_lock_parent(parent, AuLock_IR);
+		err = au_refresh_dentry(dentry, parent);
+		di_read_unlock(parent, AuLock_IR);
+		dput(parent);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
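+/*
+ * prepare for changing the inode attributes: decide the target branch via
+ * au_wr_dir(), pin its parent dir, and copy up the file when the target
+ * branch differs from the current top branch (using the whiteout'ed name for
+ * an unlinked file). on success the lower inode is locked with AuLsc_I_CHILD.
+ */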
+int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
+		     struct au_icpup_args *a)
+{
+	int err;
+	loff_t sz;
+	aufs_bindex_t bstart, ibstart;
+	struct dentry *hi_wh, *parent;
+	struct inode *inode;
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= 0
+	};
+
+	if (d_is_dir(dentry))
+		au_fset_wrdir(wr_dir_args.flags, ISDIR);
+	/* plink or hi_wh() case */
+	bstart = au_dbstart(dentry);
+	inode = d_inode(dentry);
+	ibstart = au_ibstart(inode);
+	if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode))
+		wr_dir_args.force_btgt = ibstart;
+	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
+	if (unlikely(err < 0))
+		goto out;
+	a->btgt = err;
+	if (err != bstart)
+		au_fset_icpup(a->flags, DID_CPUP);
+
+	err = 0;
+	a->pin_flags = AuPin_MNT_WRITE;
+	parent = NULL;
+	if (!IS_ROOT(dentry)) {
+		au_fset_pin(a->pin_flags, DI_LOCKED);
+		parent = dget_parent(dentry);
+		di_write_lock_parent(parent);
+	}
+
+	err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags);
+	if (unlikely(err))
+		goto out_parent;
+
+	a->h_path.dentry = au_h_dptr(dentry, bstart);
+	sz = -1;
+	a->h_inode = d_inode(a->h_path.dentry);
+	if (ia && (ia->ia_valid & ATTR_SIZE)) {
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+		if (ia->ia_size < i_size_read(a->h_inode))
+			sz = ia->ia_size;
+		inode_unlock(a->h_inode);
+	}
+
+	hi_wh = NULL;
+	if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) {
+		hi_wh = au_hi_wh(inode, a->btgt);
+		if (!hi_wh) {
+			struct au_cp_generic cpg = {
+				.dentry	= dentry,
+				.bdst	= a->btgt,
+				.bsrc	= -1,
+				.len	= sz,
+				.pin	= &a->pin
+			};
+			err = au_sio_cpup_wh(&cpg, /*file*/NULL);
+			if (unlikely(err))
+				goto out_unlock;
+			hi_wh = au_hi_wh(inode, a->btgt);
+			/* todo: revalidate hi_wh? */
+		}
+	}
+
+	if (parent) {
+		au_pin_set_parent_lflag(&a->pin, /*lflag*/0);
+		di_downgrade_lock(parent, AuLock_IR);
+		dput(parent);
+		parent = NULL;
+	}
+	if (!au_ftest_icpup(a->flags, DID_CPUP))
+		goto out; /* success */
+
+	if (!d_unhashed(dentry)) {
+		struct au_cp_generic cpg = {
+			.dentry	= dentry,
+			.bdst	= a->btgt,
+			.bsrc	= bstart,
+			.len	= sz,
+			.pin	= &a->pin,
+			.flags	= AuCpup_DTIME | AuCpup_HOPEN
+		};
+		err = au_sio_cpup_simple(&cpg);
+		if (!err)
+			a->h_path.dentry = au_h_dptr(dentry, a->btgt);
+	} else if (!hi_wh)
+		a->h_path.dentry = au_h_dptr(dentry, a->btgt);
+	else
+		a->h_path.dentry = hi_wh; /* do not dget here */
+
+out_unlock:
+	a->h_inode = d_inode(a->h_path.dentry);
+	if (!err)
+		goto out; /* success */
+	au_unpin(&a->pin);
+out_parent:
+	if (parent) {
+		di_write_unlock(parent);
+		dput(parent);
+	}
+out:
+	if (!err)
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+	return err;
+}
+
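+/*
+ * ->setattr() for aufs: copy up to a writable branch when necessary, apply
+ * the change to the lower inode, then refresh the aufs inode attributes.
+ */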
+static int aufs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int err;
+	struct inode *inode, *delegated;
+	struct super_block *sb;
+	struct file *file;
+	struct au_icpup_args *a;
+
+	inode = d_inode(dentry);
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		ia->ia_valid &= ~ATTR_MODE;
+
+	file = NULL;
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_kfree;
+
+	if (ia->ia_valid & ATTR_FILE) {
+		/* currently ftruncate(2) only */
+		AuDebugOn(!d_is_reg(dentry));
+		file = ia->ia_file;
+		err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
+		if (unlikely(err))
+			goto out_si;
+		ia->ia_file = au_hf_top(file);
+		a->udba = AuOpt_UDBA_NONE;
+	} else {
+		/* fchmod() doesn't pass ia_file */
+		a->udba = au_opt_udba(sb);
+		di_write_lock_child(dentry);
+		/* no d_unlinked(), to set UDBA_NONE for root */
+		if (d_unhashed(dentry))
+			a->udba = AuOpt_UDBA_NONE;
+		if (a->udba != AuOpt_UDBA_NONE) {
+			AuDebugOn(IS_ROOT(dentry));
+			err = au_reval_for_attr(dentry, au_sigen(sb));
+			if (unlikely(err))
+				goto out_dentry;
+		}
+	}
+
+	err = au_pin_and_icpup(dentry, ia, a);
+	if (unlikely(err < 0))
+		goto out_dentry;
+	if (au_ftest_icpup(a->flags, DID_CPUP)) {
+		ia->ia_file = NULL;
+		ia->ia_valid &= ~ATTR_FILE;
+	}
+
+	a->h_path.mnt = au_sbr_mnt(sb, a->btgt);
+	if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME))
+	    == (ATTR_MODE | ATTR_CTIME)) {
+		err = security_path_chmod(&a->h_path, ia->ia_mode);
+		if (unlikely(err))
+			goto out_unlock;
+	} else if ((ia->ia_valid & (ATTR_UID | ATTR_GID))
+		   && (ia->ia_valid & ATTR_CTIME)) {
+		err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	if (ia->ia_valid & ATTR_SIZE) {
+		struct file *f;
+
+		if (ia->ia_size < i_size_read(inode))
+			/* unmap only */
+			truncate_setsize(inode, ia->ia_size);
+
+		f = NULL;
+		if (ia->ia_valid & ATTR_FILE)
+			f = ia->ia_file;
+		inode_unlock(a->h_inode);
+		err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f);
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+	} else {
+		delegated = NULL;
+		while (1) {
+			err = vfsub_notify_change(&a->h_path, ia, &delegated);
+			if (delegated) {
+				err = break_deleg_wait(&delegated);
+				if (!err)
+					continue;
+			}
+			break;
+		}
+	}
+	/*
+	 * regardless of the aufs 'acl' option setting.
+	 * why don't all ACL-aware filesystems call this from their ->setattr()?
+	 */
+	if (!err && (ia->ia_valid & ATTR_MODE))
+		err = vfsub_acl_chmod(a->h_inode, ia->ia_mode);
+	if (!err)
+		au_cpup_attr_changeable(inode);
+
+out_unlock:
+	inode_unlock(a->h_inode);
+	au_unpin(&a->pin);
+	if (unlikely(err))
+		au_update_dbstart(dentry);
+out_dentry:
+	di_write_unlock(dentry);
+	if (file) {
+		fi_write_unlock(file);
+		ia->ia_file = file;
+		ia->ia_valid |= ATTR_FILE;
+	}
+out_si:
+	si_read_unlock(sb);
+out_kfree:
+	kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
+static int au_h_path_to_set_attr(struct dentry *dentry,
+				 struct au_icpup_args *a, struct path *h_path)
+{
+	int err;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	a->udba = au_opt_udba(sb);
+	/* no d_unlinked(), to set UDBA_NONE for root */
+	if (d_unhashed(dentry))
+		a->udba = AuOpt_UDBA_NONE;
+	if (a->udba != AuOpt_UDBA_NONE) {
+		AuDebugOn(IS_ROOT(dentry));
+		err = au_reval_for_attr(dentry, au_sigen(sb));
+		if (unlikely(err))
+			goto out;
+	}
+	err = au_pin_and_icpup(dentry, /*ia*/NULL, a);
+	if (unlikely(err < 0))
+		goto out;
+
+	h_path->dentry = a->h_path.dentry;
+	h_path->mnt = au_sbr_mnt(sb, a->btgt);
+
+out:
+	return err;
+}
+
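+/*
+ * common helper for the aufs set/removexattr and set_acl paths: copy up when
+ * necessary and apply the operation to the lower dentry/inode.
+ */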
+ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg)
+{
+	int err;
+	struct path h_path;
+	struct super_block *sb;
+	struct au_icpup_args *a;
+	struct inode *inode, *h_inode;
+
+	inode = d_inode(dentry);
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_kfree;
+
+	h_path.dentry = NULL;	/* silence gcc */
+	di_write_lock_child(dentry);
+	err = au_h_path_to_set_attr(dentry, a, &h_path);
+	if (unlikely(err))
+		goto out_di;
+
+	inode_unlock(a->h_inode);
+	switch (arg->type) {
+	case AU_XATTR_SET:
+		err = vfsub_setxattr(h_path.dentry,
+				     arg->u.set.name, arg->u.set.value,
+				     arg->u.set.size, arg->u.set.flags);
+		break;
+	case AU_XATTR_REMOVE:
+		err = vfsub_removexattr(h_path.dentry, arg->u.remove.name);
+		break;
+	case AU_ACL_SET:
+		err = -EOPNOTSUPP;
+		h_inode = d_inode(h_path.dentry);
+		if (h_inode->i_op->set_acl)
+			err = h_inode->i_op->set_acl(h_inode,
+						     arg->u.acl_set.acl,
+						     arg->u.acl_set.type);
+		break;
+	}
+	if (!err)
+		au_cpup_attr_timesizes(inode);
+
+	au_unpin(&a->pin);
+	if (unlikely(err))
+		au_update_dbstart(dentry);
+
+out_di:
+	di_write_unlock(dentry);
+	si_read_unlock(sb);
+out_kfree:
+	kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+#endif
+
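+/* copy the lower attributes obtained by vfs_getattr() into the aufs inode */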
+static void au_refresh_iattr(struct inode *inode, struct kstat *st,
+			     unsigned int nlink)
+{
+	unsigned int n;
+
+	inode->i_mode = st->mode;
+	/* don't i_[ug]id_write() here */
+	inode->i_uid = st->uid;
+	inode->i_gid = st->gid;
+	inode->i_atime = st->atime;
+	inode->i_mtime = st->mtime;
+	inode->i_ctime = st->ctime;
+
+	au_cpup_attr_nlink(inode, /*force*/0);
+	if (S_ISDIR(inode->i_mode)) {
+		n = inode->i_nlink;
+		n -= nlink;
+		n += st->nlink;
+		smp_mb(); /* for i_nlink */
+		/* 0 can happen */
+		set_nlink(inode, n);
+	}
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks = st->blocks;
+	i_size_write(inode, st->size);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * common routine for aufs_getattr() and aufs_getxattr().
+ * returns zero or negative (an error).
+ * @dentry will be read-locked on success.
+ */
+int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path)
+{
+	int err;
+	unsigned int mnt_flags, sigen;
+	unsigned char udba_none;
+	aufs_bindex_t bindex;
+	struct super_block *sb, *h_sb;
+	struct inode *inode;
+
+	h_path->mnt = NULL;
+	h_path->dentry = NULL;
+
+	err = 0;
+	sb = dentry->d_sb;
+	mnt_flags = au_mntflags(sb);
+	udba_none = !!au_opt_test(mnt_flags, UDBA_NONE);
+
+	/* support fstat(2) */
+	if (!d_unlinked(dentry) && !udba_none) {
+		sigen = au_sigen(sb);
+		err = au_digen_test(dentry, sigen);
+		if (!err) {
+			di_read_lock_child(dentry, AuLock_IR);
+			err = au_dbrange_test(dentry);
+			if (unlikely(err)) {
+				di_read_unlock(dentry, AuLock_IR);
+				goto out;
+			}
+		} else {
+			AuDebugOn(IS_ROOT(dentry));
+			di_write_lock_child(dentry);
+			err = au_dbrange_test(dentry);
+			if (!err)
+				err = au_reval_for_attr(dentry, sigen);
+			if (!err)
+				di_downgrade_lock(dentry, AuLock_IR);
+			else {
+				di_write_unlock(dentry);
+				goto out;
+			}
+		}
+	} else
+		di_read_lock_child(dentry, AuLock_IR);
+
+	inode = d_inode(dentry);
+	bindex = au_ibstart(inode);
+	h_path->mnt = au_sbr_mnt(sb, bindex);
+	h_sb = h_path->mnt->mnt_sb;
+	if (!force
+	    && !au_test_fs_bad_iattr(h_sb)
+	    && udba_none)
+		goto out; /* success */
+
+	if (au_dbstart(dentry) == bindex)
+		h_path->dentry = au_h_dptr(dentry, bindex);
+	else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) {
+		h_path->dentry = au_plink_lkup(inode, bindex);
+		if (IS_ERR(h_path->dentry))
+			/* pretending success */
+			h_path->dentry = NULL;
+		else
+			dput(h_path->dentry);
+	}
+
+out:
+	return err;
+}
+
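+/*
+ * ->getattr() for aufs: stat the lower file when possible, refresh the aufs
+ * inode from the result, and fill @st from the aufs inode.
+ */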
+static int aufs_getattr(struct vfsmount *mnt __maybe_unused,
+			struct dentry *dentry, struct kstat *st)
+{
+	int err;
+	unsigned char positive;
+	struct path h_path;
+	struct inode *inode;
+	struct super_block *sb;
+
+	inode = d_inode(dentry);
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+	err = au_h_path_getattr(dentry, /*force*/0, &h_path);
+	if (unlikely(err))
+		goto out_si;
+	if (unlikely(!h_path.dentry))
+		/* illegally overlapped or something */
+		goto out_fill; /* pretending success */
+
+	positive = d_is_positive(h_path.dentry);
+	if (positive)
+		err = vfs_getattr(&h_path, st);
+	if (!err) {
+		if (positive)
+			au_refresh_iattr(inode, st,
+					 d_inode(h_path.dentry)->i_nlink);
+		goto out_fill; /* success */
+	}
+	AuTraceErr(err);
+	goto out_di;
+
+out_fill:
+	generic_fillattr(inode, st);
+out_di:
+	di_read_unlock(dentry, AuLock_IR);
+out_si:
+	si_read_unlock(sb);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
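+/*
+ * ->get_link() for aufs symlinks: find a lower dentry for the symlink on the
+ * top branch (or any alias of the lower inode) and call the lower
+ * filesystem's ->get_link() with it.
+ */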
+static const char *aufs_get_link(struct dentry *dentry, struct inode *inode,
+				 struct delayed_call *done)
+{
+	const char *ret;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	int err;
+	aufs_bindex_t bindex;
+
+	ret = NULL; /* suppress a warning */
+	err = -ECHILD;
+	if (!dentry)
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN);
+	if (unlikely(err))
+		goto out;
+
+	err = au_d_hashed_positive(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = -EINVAL;
+	inode = d_inode(dentry);
+	bindex = au_ibstart(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (unlikely(!h_inode->i_op->get_link))
+		goto out_unlock;
+
+	err = -EBUSY;
+	h_dentry = NULL;
+	if (au_dbstart(dentry) <= bindex) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry)
+			dget(h_dentry);
+	}
+	if (!h_dentry) {
+		h_dentry = d_find_any_alias(h_inode);
+		if (IS_ERR(h_dentry)) {
+			err = PTR_ERR(h_dentry);
+			goto out_unlock;
+		}
+	}
+	if (unlikely(!h_dentry))
+		goto out_unlock;
+
+	err = 0;
+	AuDbg("%pf\n", h_inode->i_op->get_link);
+	AuDbgDentry(h_dentry);
+	ret = h_inode->i_op->get_link(h_dentry, h_inode, done);
+	dput(h_dentry);
+	if (IS_ERR(ret))
+		err = PTR_ERR(ret);
+
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_IR);
+out:
+	if (unlikely(err))
+		ret = ERR_PTR(err);
+	AuTraceErrPtr(ret);
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
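+/*
+ * ->update_time() for aufs: update the time on the top lower inode and copy
+ * the timestamps back into the aufs inode.
+ */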
+static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags)
+{
+	int err;
+	struct super_block *sb;
+	struct inode *h_inode;
+
+	sb = inode->i_sb;
+	/* mmap_sem might be acquired already, cf. aufs_mmap() */
+	lockdep_off();
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_write_lock_child(inode);
+	lockdep_on();
+	h_inode = au_h_iptr(inode, au_ibstart(inode));
+	err = vfsub_update_time(h_inode, ts, flags);
+	lockdep_off();
+	if (!err)
+		au_cpup_attr_timesizes(inode);
+	ii_write_unlock(inode);
+	si_read_unlock(sb);
+	lockdep_on();
+
+	if (!err && (flags & S_VERSION))
+		inode_inc_iversion(inode);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* no getattr version will be set by module.c:aufs_init() */
+struct inode_operations aufs_iop_nogetattr[AuIop_Last],
+	aufs_iop[] = {
+	[AuIop_SYMLINK] = {
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl, /* unsupported for symlinks? */
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.readlink	= generic_readlink,
+		.get_link	= aufs_get_link,
+
+		/* .update_time	= aufs_update_time */
+	},
+	[AuIop_DIR] = {
+		.create		= aufs_create,
+		.lookup		= aufs_lookup,
+		.link		= aufs_link,
+		.unlink		= aufs_unlink,
+		.symlink	= aufs_symlink,
+		.mkdir		= aufs_mkdir,
+		.rmdir		= aufs_rmdir,
+		.mknod		= aufs_mknod,
+		.rename		= aufs_rename,
+
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl,
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.update_time	= aufs_update_time,
+		.atomic_open	= aufs_atomic_open,
+		.tmpfile	= aufs_tmpfile
+	},
+	[AuIop_OTHER] = {
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl,
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.update_time	= aufs_update_time
+	}
+};
diff --git b/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c
new file mode 100644
index 0000000..533b36a
--- /dev/null
+++ b/fs/aufs/i_op_add.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (add entry)
+ */
+
+#include "aufs.h"
+
+/*
+ * final procedure of adding a new entry, except link(2).
+ * remove the whiteout, instantiate the dentry, copy up the parent dir's times
+ * and size, and update its version.
+ * if it fails, re-create the removed whiteout.
+ */
+static int epilog(struct inode *dir, aufs_bindex_t bindex,
+		  struct dentry *wh_dentry, struct dentry *dentry)
+{
+	int err, rerr;
+	aufs_bindex_t bwh;
+	struct path h_path;
+	struct super_block *sb;
+	struct inode *inode, *h_dir;
+	struct dentry *wh;
+
+	bwh = -1;
+	sb = dir->i_sb;
+	if (wh_dentry) {
+		h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
+		IMustLock(h_dir);
+		AuDebugOn(au_h_iptr(dir, bindex) != h_dir);
+		bwh = au_dbwh(dentry);
+		h_path.dentry = wh_dentry;
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path,
+					  dentry);
+		if (unlikely(err))
+			goto out;
+	}
+
+	inode = au_new_inode(dentry, /*must_new*/1);
+	if (!IS_ERR(inode)) {
+		d_instantiate(dentry, inode);
+		dir = d_inode(dentry->d_parent); /* dir inode is locked */
+		IMustLock(dir);
+		au_dir_ts(dir, bindex);
+		dir->i_version++;
+		au_fhsm_wrote(sb, bindex, /*force*/0);
+		return 0; /* success */
+	}
+
+	err = PTR_ERR(inode);
+	if (!wh_dentry)
+		goto out;
+
+	/* revert */
+	/* dir inode is locked */
+	wh = au_wh_create(dentry, bwh, wh_dentry->d_parent);
+	rerr = PTR_ERR(wh);
+	if (IS_ERR(wh)) {
+		AuIOErr("%pd reverting whiteout failed(%d, %d)\n",
+			dentry, err, rerr);
+		err = -EIO;
+	} else
+		dput(wh);
+
+out:
+	return err;
+}
+
+static int au_d_may_add(struct dentry *dentry)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(d_unhashed(dentry)))
+		err = -ENOENT;
+	if (unlikely(d_really_is_positive(dentry)))
+		err = -EEXIST;
+	return err;
+}
+
+/*
+ * simple tests for the adding inode operations.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir)
+{
+	int err;
+	umode_t h_mode;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	err = -ENAMETOOLONG;
+	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		goto out;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (d_really_is_negative(dentry)) {
+		err = -EEXIST;
+		if (unlikely(d_is_positive(h_dentry)))
+			goto out;
+	} else {
+		/* rename(2) case */
+		err = -EIO;
+		if (unlikely(d_is_negative(h_dentry)))
+			goto out;
+		h_inode = d_inode(h_dentry);
+		if (unlikely(!h_inode->i_nlink))
+			goto out;
+
+		h_mode = h_inode->i_mode;
+		if (!isdir) {
+			err = -EISDIR;
+			if (unlikely(S_ISDIR(h_mode)))
+				goto out;
+		} else if (unlikely(!S_ISDIR(h_mode))) {
+			err = -ENOTDIR;
+			goto out;
+		}
+	}
+
+	err = 0;
+	/* expected parent dir is locked */
+	if (unlikely(h_parent != h_dentry->d_parent))
+		err = -EIO;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * initial procedure of adding a new entry.
+ * prepare the writable branch and the parent dir, lock it,
+ * and look up the whiteout for the new entry.
+ */
+static struct dentry*
+lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt,
+		  struct dentry *src_dentry, struct au_pin *pin,
+		  struct au_wr_dir_args *wr_dir_args)
+{
+	struct dentry *wh_dentry, *h_parent;
+	struct super_block *sb;
+	struct au_branch *br;
+	int err;
+	unsigned int udba;
+	aufs_bindex_t bcpup;
+
+	AuDbg("%pd\n", dentry);
+
+	err = au_wr_dir(dentry, src_dentry, wr_dir_args);
+	bcpup = err;
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	udba = au_opt_udba(sb);
+	err = au_pin(pin, dentry, bcpup, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	h_parent = au_pinned_h_parent(pin);
+	if (udba != AuOpt_UDBA_NONE
+	    && au_dbstart(dentry) == bcpup)
+		err = au_may_add(dentry, bcpup, h_parent,
+				 au_ftest_wrdir(wr_dir_args->flags, ISDIR));
+	else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		err = -ENAMETOOLONG;
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_unpin;
+
+	br = au_sbr(sb, bcpup);
+	if (dt) {
+		struct path tmp = {
+			.dentry	= h_parent,
+			.mnt	= au_br_mnt(br)
+		};
+		au_dtime_store(dt, au_pinned_parent(pin), &tmp);
+	}
+
+	wh_dentry = NULL;
+	if (bcpup != au_dbwh(dentry))
+		goto out; /* success */
+
+	/*
+	 * ENAMETOOLONG here means that if we allowed creating such a name, it
+	 * could not be removed later. So we don't allow such a name here, and
+	 * we don't handle ENAMETOOLONG differently here.
+	 */
+	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
+
+out_unpin:
+	if (IS_ERR(wh_dentry))
+		au_unpin(pin);
+out:
+	return wh_dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+enum { Mknod, Symlink, Creat };
+struct simple_arg {
+	int type;
+	union {
+		struct {
+			umode_t			mode;
+			bool			want_excl;
+			bool			try_aopen;
+			struct vfsub_aopen_args	*aopen;
+		} c;
+		struct {
+			const char *symname;
+		} s;
+		struct {
+			umode_t mode;
+			dev_t dev;
+		} m;
+	} u;
+};
+
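+/*
+ * common body of aufs_create(), aufs_symlink(), aufs_mknod() and the
+ * atomic-open creation path: prepare the writable branch, create the lower
+ * entry there, finish via epilog(), and revert the lower entry on failure.
+ */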
+static int add_simple(struct inode *dir, struct dentry *dentry,
+		      struct simple_arg *arg)
+{
+	int err, rerr;
+	aufs_bindex_t bstart;
+	unsigned char created;
+	const unsigned char try_aopen
+		= (arg->type == Creat && arg->u.c.try_aopen);
+	struct dentry *wh_dentry, *parent;
+	struct inode *h_dir;
+	struct super_block *sb;
+	struct au_branch *br;
+	/* to reduce the stack usage */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+		struct path h_path;
+		struct au_wr_dir_args wr_dir_args;
+	} *a;
+
+	AuDbg("%pd\n", dentry);
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+	a->wr_dir_args.force_btgt = -1;
+	a->wr_dir_args.flags = AuWrDir_ADD_ENTRY;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	if (!try_aopen) {
+		err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+		if (unlikely(err))
+			goto out_free;
+	}
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	if (!try_aopen)
+		di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
+				      &a->pin, &a->wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	bstart = au_dbstart(dentry);
+	sb = dentry->d_sb;
+	br = au_sbr(sb, bstart);
+	a->h_path.dentry = au_h_dptr(dentry, bstart);
+	a->h_path.mnt = au_br_mnt(br);
+	h_dir = au_pinned_h_dir(&a->pin);
+	switch (arg->type) {
+	case Creat:
+		err = 0;
+		if (!try_aopen || !h_dir->i_op->atomic_open)
+			err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode,
+					   arg->u.c.want_excl);
+		else
+			err = vfsub_atomic_open(h_dir, a->h_path.dentry,
+						arg->u.c.aopen, br);
+		break;
+	case Symlink:
+		err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname);
+		break;
+	case Mknod:
+		err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode,
+				  arg->u.m.dev);
+		break;
+	default:
+		BUG();
+	}
+	created = !err;
+	if (!err)
+		err = epilog(dir, bstart, wh_dentry, dentry);
+
+	/* revert */
+	if (unlikely(created && err && d_is_positive(a->h_path.dentry))) {
+		/* no delegation since it is just created */
+		rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL,
+				    /*force*/0);
+		if (rerr) {
+			AuIOErr("%pd revert failure(%d, %d)\n",
+				dentry, err, rerr);
+			err = -EIO;
+		}
+		au_dtime_revert(&a->dt);
+	}
+
+	if (!err && try_aopen && !h_dir->i_op->atomic_open)
+		*arg->u.c.aopen->opened |= FILE_CREATED;
+
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+
+out_parent:
+	if (!try_aopen)
+		di_write_unlock(parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbstart(dentry);
+		d_drop(dentry);
+	}
+	if (!try_aopen)
+		aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	kfree(a);
+out:
+	return err;
+}
+
+int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+	       dev_t dev)
+{
+	struct simple_arg arg = {
+		.type = Mknod,
+		.u.m = {
+			.mode	= mode,
+			.dev	= dev
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct simple_arg arg = {
+		.type = Symlink,
+		.u.s.symname = symname
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool want_excl)
+{
+	struct simple_arg arg = {
+		.type = Creat,
+		.u.c = {
+			.mode		= mode,
+			.want_excl	= want_excl
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
+		       struct vfsub_aopen_args *aopen_args)
+{
+	struct simple_arg arg = {
+		.type = Creat,
+		.u.c = {
+			.mode		= aopen_args->create_mode,
+			.want_excl	= aopen_args->open_flag & O_EXCL,
+			.try_aopen	= true,
+			.aopen		= aopen_args
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
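+/*
+ * ->tmpfile() for aufs: pick a writable branch, call the lower filesystem's
+ * ->tmpfile() there, and wrap the new anonymous inode via d_tmpfile().
+ */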
+int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent, *h_dentry;
+	struct inode *h_dir, *inode;
+	struct vfsmount *h_mnt;
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= AuWrDir_TMPFILE
+	};
+
+	/* copy-up may happen */
+	inode_lock(dir);
+
+	sb = dir->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+
+	err = au_di_init(dentry);
+	if (unlikely(err))
+		goto out_si;
+
+	err = -EBUSY;
+	parent = d_find_any_alias(dir);
+	AuDebugOn(!parent);
+	di_write_lock_parent(parent);
+	if (unlikely(d_inode(parent) != dir))
+		goto out_parent;
+
+	err = au_digen_test(parent, au_sigen(sb));
+	if (unlikely(err))
+		goto out_parent;
+
+	bindex = au_dbstart(parent);
+	au_set_dbstart(dentry, bindex);
+	au_set_dbend(dentry, bindex);
+	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
+	bindex = err;
+	if (unlikely(err < 0))
+		goto out_parent;
+
+	err = -EOPNOTSUPP;
+	h_dir = au_h_iptr(dir, bindex);
+	if (unlikely(!h_dir->i_op->tmpfile))
+		goto out_parent;
+
+	h_mnt = au_sbr_mnt(sb, bindex);
+	err = vfsub_mnt_want_write(h_mnt);
+	if (unlikely(err))
+		goto out_parent;
+
+	h_parent = au_h_dptr(parent, bindex);
+	err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC);
+	if (unlikely(err))
+		goto out_mnt;
+
+	err = -ENOMEM;
+	h_dentry = d_alloc(h_parent, &dentry->d_name);
+	if (unlikely(!h_dentry))
+		goto out_mnt;
+
+	err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode);
+	if (unlikely(err))
+		goto out_dentry;
+
+	au_set_dbstart(dentry, bindex);
+	au_set_dbend(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, dget(h_dentry));
+	inode = au_new_inode(dentry, /*must_new*/1);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		au_set_h_dptr(dentry, bindex, NULL);
+		au_set_dbstart(dentry, -1);
+		au_set_dbend(dentry, -1);
+	} else {
+		if (!inode->i_nlink)
+			set_nlink(inode, 1);
+		d_tmpfile(dentry, inode);
+		au_di(dentry)->di_tmpfile = 1;
+
+		/* update without i_mutex */
+		if (au_ibstart(dir) == au_dbstart(dentry))
+			au_cpup_attr_timesizes(dir);
+	}
+
+out_dentry:
+	dput(h_dentry);
+out_mnt:
+	vfsub_mnt_drop_write(h_mnt);
+out_parent:
+	di_write_unlock(parent);
+	dput(parent);
+	di_write_unlock(dentry);
+	if (!err)
+#if 0
+		/* verbose coding for lock class name */
+		au_rw_class(&au_di(dentry)->di_rwsem,
+			    au_lc_key + AuLcNonDir_DIINFO);
+#else
+		;
+#endif
+	else {
+		au_di_fin(dentry);
+		dentry->d_fsdata = NULL;
+	}
+out_si:
+	si_read_unlock(sb);
+out:
+	inode_unlock(dir);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_link_args {
+	aufs_bindex_t bdst, bsrc;
+	struct au_pin pin;
+	struct path h_path;
+	struct dentry *src_parent, *parent;
+};
+
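+/*
+ * used when pseudo-links are disabled: copy up @src_dentry (and its parent
+ * dirs) to the branch where the new link will be created, so that a plain
+ * link(2) can be done there.
+ */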
+static int au_cpup_before_link(struct dentry *src_dentry,
+			       struct au_link_args *a)
+{
+	int err;
+	struct dentry *h_src_dentry;
+	struct au_cp_generic cpg = {
+		.dentry	= src_dentry,
+		.bdst	= a->bdst,
+		.bsrc	= a->bsrc,
+		.len	= -1,
+		.pin	= &a->pin,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */
+	};
+
+	di_read_lock_parent(a->src_parent, AuLock_IR);
+	err = au_test_and_cpup_dirs(src_dentry, a->bdst);
+	if (unlikely(err))
+		goto out;
+
+	h_src_dentry = au_h_dptr(src_dentry, a->bsrc);
+	err = au_pin(&a->pin, src_dentry, a->bdst,
+		     au_opt_udba(src_dentry->d_sb),
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out;
+
+	err = au_sio_cpup_simple(&cpg);
+	au_unpin(&a->pin);
+
+out:
+	di_read_unlock(a->src_parent, AuLock_IR);
+	return err;
+}
+
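+/*
+ * link(2) case where the source lives on a lower branch than the target
+ * name: either copy up the source under the new name, or, when the source
+ * inode already exists on the target branch, link(2) to that lower inode.
+ * a pseudo-link is recorded unless one was already used for the lookup.
+ */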
+static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry,
+			   struct au_link_args *a)
+{
+	int err;
+	unsigned char plink;
+	aufs_bindex_t bend;
+	struct dentry *h_src_dentry;
+	struct inode *h_inode, *inode, *delegated;
+	struct super_block *sb;
+	struct file *h_file;
+
+	plink = 0;
+	h_inode = NULL;
+	sb = src_dentry->d_sb;
+	inode = d_inode(src_dentry);
+	if (au_ibstart(inode) <= a->bdst)
+		h_inode = au_h_iptr(inode, a->bdst);
+	if (!h_inode || !h_inode->i_nlink) {
+		/* copy up src_dentry under the name of dentry. */
+		bend = au_dbend(dentry);
+		if (bend < a->bsrc)
+			au_set_dbend(dentry, a->bsrc);
+		au_set_h_dptr(dentry, a->bsrc,
+			      dget(au_h_dptr(src_dentry, a->bsrc)));
+		dget(a->h_path.dentry);
+		au_set_h_dptr(dentry, a->bdst, NULL);
+		AuDbg("temporary d_inode...\n");
+		spin_lock(&dentry->d_lock);
+		dentry->d_inode = d_inode(src_dentry); /* tmp */
+		spin_unlock(&dentry->d_lock);
+		h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0);
+		if (IS_ERR(h_file))
+			err = PTR_ERR(h_file);
+		else {
+			struct au_cp_generic cpg = {
+				.dentry	= dentry,
+				.bdst	= a->bdst,
+				.bsrc	= -1,
+				.len	= -1,
+				.pin	= &a->pin,
+				.flags	= AuCpup_KEEPLINO
+			};
+			err = au_sio_cpup_simple(&cpg);
+			au_h_open_post(dentry, a->bsrc, h_file);
+			if (!err) {
+				dput(a->h_path.dentry);
+				a->h_path.dentry = au_h_dptr(dentry, a->bdst);
+			} else
+				au_set_h_dptr(dentry, a->bdst,
+					      a->h_path.dentry);
+		}
+		spin_lock(&dentry->d_lock);
+		dentry->d_inode = NULL; /* restore */
+		spin_unlock(&dentry->d_lock);
+		AuDbg("temporary d_inode...done\n");
+		au_set_h_dptr(dentry, a->bsrc, NULL);
+		au_set_dbend(dentry, bend);
+	} else {
+		/* the inode of src_dentry already exists on the a->bdst branch */
+		h_src_dentry = d_find_alias(h_inode);
+		if (!h_src_dentry && au_plink_test(inode)) {
+			plink = 1;
+			h_src_dentry = au_plink_lkup(inode, a->bdst);
+			err = PTR_ERR(h_src_dentry);
+			if (IS_ERR(h_src_dentry))
+				goto out;
+
+			if (unlikely(d_is_negative(h_src_dentry))) {
+				dput(h_src_dentry);
+				h_src_dentry = NULL;
+			}
+
+		}
+		if (h_src_dentry) {
+			delegated = NULL;
+			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
+					 &a->h_path, &delegated);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+			dput(h_src_dentry);
+		} else {
+			AuIOErr("no dentry found for hi%lu on b%d\n",
+				h_inode->i_ino, a->bdst);
+			err = -EIO;
+		}
+	}
+
+	if (!err && !plink)
+		au_plink_append(inode, a->bdst, a->h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
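+/*
+ * ->link() for aufs dirs: link(2) on the branch holding the new name,
+ * copying up the source first (or going through au_cpup_or_link()) when the
+ * source lives on a lower branch.
+ */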
+int aufs_link(struct dentry *src_dentry, struct inode *dir,
+	      struct dentry *dentry)
+{
+	int err, rerr;
+	struct au_dtime dt;
+	struct au_link_args *a;
+	struct dentry *wh_dentry, *h_src_dentry;
+	struct inode *inode, *delegated;
+	struct super_block *sb;
+	struct au_wr_dir_args wr_dir_args = {
+		/* .force_btgt	= -1, */
+		.flags		= AuWrDir_ADD_ENTRY
+	};
+
+	IMustLock(dir);
+	inode = d_inode(src_dentry);
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	a->parent = dentry->d_parent; /* dir inode is locked */
+	err = aufs_read_and_write_lock2(dentry, src_dentry,
+					AuLock_NOPLM | AuLock_GEN);
+	if (unlikely(err))
+		goto out_kfree;
+	err = au_d_linkable(src_dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	a->src_parent = dget_parent(src_dentry);
+	wr_dir_args.force_btgt = au_ibstart(inode);
+
+	di_write_lock_parent(a->parent);
+	wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin,
+				      &wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	err = 0;
+	sb = dentry->d_sb;
+	a->bdst = au_dbstart(dentry);
+	a->h_path.dentry = au_h_dptr(dentry, a->bdst);
+	a->h_path.mnt = au_sbr_mnt(sb, a->bdst);
+	a->bsrc = au_ibstart(inode);
+	h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
+	if (!h_src_dentry && au_di(src_dentry)->di_tmpfile)
+		h_src_dentry = dget(au_hi_wh(inode, a->bsrc));
+	if (!h_src_dentry) {
+		a->bsrc = au_dbstart(src_dentry);
+		h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
+		AuDebugOn(!h_src_dentry);
+	} else if (IS_ERR(h_src_dentry)) {
+		err = PTR_ERR(h_src_dentry);
+		goto out_parent;
+	}
+
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		if (a->bdst < a->bsrc
+		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */)
+			err = au_cpup_or_link(src_dentry, dentry, a);
+		else {
+			delegated = NULL;
+			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
+					 &a->h_path, &delegated);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+		}
+		dput(h_src_dentry);
+	} else {
+		/*
+		 * copy up src_dentry to the branch we are processing,
+		 * and then link(2) to it.
+		 */
+		dput(h_src_dentry);
+		if (a->bdst < a->bsrc
+		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) {
+			au_unpin(&a->pin);
+			di_write_unlock(a->parent);
+			err = au_cpup_before_link(src_dentry, a);
+			di_write_lock_parent(a->parent);
+			if (!err)
+				err = au_pin(&a->pin, dentry, a->bdst,
+					     au_opt_udba(sb),
+					     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+			if (unlikely(err))
+				goto out_wh;
+		}
+		if (!err) {
+			h_src_dentry = au_h_dptr(src_dentry, a->bdst);
+			err = -ENOENT;
+			if (h_src_dentry && d_is_positive(h_src_dentry)) {
+				delegated = NULL;
+				err = vfsub_link(h_src_dentry,
+						 au_pinned_h_dir(&a->pin),
+						 &a->h_path, &delegated);
+				if (unlikely(err == -EWOULDBLOCK)) {
+					pr_warn("cannot retry"
+						" for NFSv4 delegation"
+						" for an internal link\n");
+					iput(delegated);
+				}
+			}
+		}
+	}
+	if (unlikely(err))
+		goto out_unpin;
+
+	if (wh_dentry) {
+		a->h_path.dentry = wh_dentry;
+		err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path,
+					  dentry);
+		if (unlikely(err))
+			goto out_revert;
+	}
+
+	au_dir_ts(dir, a->bdst);
+	dir->i_version++;
+	inc_nlink(inode);
+	inode->i_ctime = dir->i_ctime;
+	d_instantiate(dentry, au_igrab(inode));
+	if (d_unhashed(a->h_path.dentry))
+		/* some filesystems call d_drop() */
+		d_drop(dentry);
+	/* some filesystems consume an inode even for a hardlink */
+	au_fhsm_wrote(sb, a->bdst, /*force*/0);
+	goto out_unpin; /* success */
+
+out_revert:
+	/* no delegation since it is just created */
+	rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path,
+			    /*delegated*/NULL, /*force*/0);
+	if (unlikely(rerr)) {
+		AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr);
+		err = -EIO;
+	}
+	au_dtime_revert(&dt);
+out_unpin:
+	au_unpin(&a->pin);
+out_wh:
+	dput(wh_dentry);
+out_parent:
+	di_write_unlock(a->parent);
+	dput(a->src_parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbstart(dentry);
+		d_drop(dentry);
+	}
+	aufs_read_and_write_unlock2(dentry, src_dentry);
+out_kfree:
+	kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
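+/*
+ * ->mkdir() for aufs: create the dir on a writable branch, make it opaque
+ * when needed, and revert the lower dir and diropq on failure.
+ */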
+int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err, rerr;
+	aufs_bindex_t bindex;
+	unsigned char diropq;
+	struct path h_path;
+	struct dentry *wh_dentry, *parent, *opq_dentry;
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct {
+		struct au_pin pin;
+		struct au_dtime dt;
+	} *a; /* reduce the stack usage */
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= AuWrDir_ADD_ENTRY | AuWrDir_ISDIR
+	};
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
+				      &a->pin, &wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	sb = dentry->d_sb;
+	bindex = au_dbstart(dentry);
+	h_path.dentry = au_h_dptr(dentry, bindex);
+	h_path.mnt = au_sbr_mnt(sb, bindex);
+	err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode);
+	if (unlikely(err))
+		goto out_unpin;
+
+	/* make the dir opaque */
+	diropq = 0;
+	h_inode = d_inode(h_path.dentry);
+	if (wh_dentry
+	    || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) {
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		opq_dentry = au_diropq_create(dentry, bindex);
+		inode_unlock(h_inode);
+		err = PTR_ERR(opq_dentry);
+		if (IS_ERR(opq_dentry))
+			goto out_dir;
+		dput(opq_dentry);
+		diropq = 1;
+	}
+
+	err = epilog(dir, bindex, wh_dentry, dentry);
+	if (!err) {
+		inc_nlink(dir);
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	if (diropq) {
+		AuLabel(revert opq);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		rerr = au_diropq_remove(dentry, bindex);
+		inode_unlock(h_inode);
+		if (rerr) {
+			AuIOErr("%pd reverting diropq failed(%d, %d)\n",
+				dentry, err, rerr);
+			err = -EIO;
+		}
+	}
+
+out_dir:
+	AuLabel(revert dir);
+	rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path);
+	if (rerr) {
+		AuIOErr("%pd reverting dir failed(%d, %d)\n",
+			dentry, err, rerr);
+		err = -EIO;
+	}
+	au_dtime_revert(&a->dt);
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+out_parent:
+	di_write_unlock(parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbstart(dentry);
+		d_drop(dentry);
+	}
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	kfree(a);
+out:
+	return err;
+}
diff --git b/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c
new file mode 100644
index 0000000..68741aa
--- /dev/null
+++ b/fs/aufs/i_op_del.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (del entry)
+ */
+
+#include "aufs.h"
+
+/*
+ * decide if a new whiteout for @dentry is necessary or not.
+ * when it is necessary, prepare the parent dir on the upper branch whose
+ * branch index is @bcpup for the creation. the actual creation of the whiteout
+ * will be done by the caller.
+ * return value:
+ * 0: wh is unnecessary
+ * plus: wh is necessary
+ * minus: error
+ */
+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup)
+{
+	int need_wh, err;
+	aufs_bindex_t bstart;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	bstart = au_dbstart(dentry);
+	if (*bcpup < 0) {
+		*bcpup = bstart;
+		if (au_test_ro(sb, bstart, d_inode(dentry))) {
+			err = AuWbrCopyup(au_sbi(sb), dentry);
+			*bcpup = err;
+			if (unlikely(err < 0))
+				goto out;
+		}
+	} else
+		AuDebugOn(bstart < *bcpup
+			  || au_test_ro(sb, *bcpup, d_inode(dentry)));
+	AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart);
+
+	if (*bcpup != bstart) {
+		err = au_cpup_dirs(dentry, *bcpup);
+		if (unlikely(err))
+			goto out;
+		need_wh = 1;
+	} else {
+		struct au_dinfo *dinfo, *tmp;
+
+		need_wh = -ENOMEM;
+		dinfo = au_di(dentry);
+		tmp = au_di_alloc(sb, AuLsc_DI_TMP);
+		if (tmp) {
+			au_di_cp(tmp, dinfo);
+			au_di_swap(tmp, dinfo);
+			/* returns the number of positive dentries */
+			need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0);
+			au_di_swap(tmp, dinfo);
+			au_rw_write_unlock(&tmp->di_rwsem);
+			au_di_free(tmp);
+		}
+	}
+	AuDbg("need_wh %d\n", need_wh);
+	err = need_wh;
+
+out:
+	return err;
+}
+
+/*
+ * simple tests for the del-entry operations.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir)
+{
+	int err;
+	umode_t h_mode;
+	struct dentry *h_dentry, *h_latest;
+	struct inode *h_inode;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (d_really_is_positive(dentry)) {
+		err = -ENOENT;
+		if (unlikely(d_is_negative(h_dentry)))
+			goto out;
+		h_inode = d_inode(h_dentry);
+		if (unlikely(!h_inode->i_nlink))
+			goto out;
+
+		h_mode = h_inode->i_mode;
+		if (!isdir) {
+			err = -EISDIR;
+			if (unlikely(S_ISDIR(h_mode)))
+				goto out;
+		} else if (unlikely(!S_ISDIR(h_mode))) {
+			err = -ENOTDIR;
+			goto out;
+		}
+	} else {
+		/* rename(2) case */
+		err = -EIO;
+		if (unlikely(d_is_positive(h_dentry)))
+			goto out;
+	}
+
+	err = -ENOENT;
+	/* expected parent dir is locked */
+	if (unlikely(h_parent != h_dentry->d_parent))
+		goto out;
+	err = 0;
+
+	/*
+	 * rmdir of a dir may break the consistency on some filesystems.
+	 * let's try a heavier test.
+	 */
+	err = -EACCES;
+	if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)
+		     && au_test_h_perm(d_inode(h_parent),
+				       MAY_EXEC | MAY_WRITE)))
+		goto out;
+
+	h_latest = au_sio_lkup_one(&dentry->d_name, h_parent);
+	err = -EIO;
+	if (IS_ERR(h_latest))
+		goto out;
+	if (h_latest == h_dentry)
+		err = 0;
+	dput(h_latest);
+
+out:
+	return err;
+}
+
+/*
+ * decide the branch where we operate for @dentry. the branch index will be
+ * stored in @rbcpup. after deciding it, 'pin' it and store the timestamps of
+ * the parent dir for reverting.
+ * when a new whiteout is necessary, create it.
+ */
+static struct dentry*
+lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup,
+		    struct au_dtime *dt, struct au_pin *pin)
+{
+	struct dentry *wh_dentry;
+	struct super_block *sb;
+	struct path h_path;
+	int err, need_wh;
+	unsigned int udba;
+	aufs_bindex_t bcpup;
+
+	need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup);
+	wh_dentry = ERR_PTR(need_wh);
+	if (unlikely(need_wh < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	udba = au_opt_udba(sb);
+	bcpup = *rbcpup;
+	err = au_pin(pin, dentry, bcpup, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	h_path.dentry = au_pinned_h_parent(pin);
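+	/*
+	 * with UDBA enabled, re-verify the entry on the target branch before
+	 * deleting it, since the lower fs may have changed behind aufs.
+	 */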
+	if (udba != AuOpt_UDBA_NONE
+	    && au_dbstart(dentry) == bcpup) {
+		err = au_may_del(dentry, bcpup, h_path.dentry, isdir);
+		wh_dentry = ERR_PTR(err);
+		if (unlikely(err))
+			goto out_unpin;
+	}
+
+	h_path.mnt = au_sbr_mnt(sb, bcpup);
+	au_dtime_store(dt, au_pinned_parent(pin), &h_path);
+	wh_dentry = NULL;
+	if (!need_wh)
+		goto out; /* success, no need to create whiteout */
+
+	wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_unpin;
+
+	/* returns with the parent locked and wh_dentry dget-ed */
+	goto out; /* success */
+
+out_unpin:
+	au_unpin(pin);
+out:
+	return wh_dentry;
+}
+
+/*
+ * when removing a dir, rename it to a unique temporary whiteout-ed name first
+ * in order to be revertible and save time for removing many child whiteouts
+ * under the dir.
+ * returns 1 when there are too many child whiteouts and the caller should
+ * remove them asynchronously. returns 0 when the number of children is small
+ * enough to remove now, or when the branch fs is a remote fs.
+ * otherwise returns an error.
+ */
+static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex,
+			   struct au_nhash *whlist, struct inode *dir)
+{
+	int rmdir_later, err, dirwh;
+	struct dentry *h_dentry;
+	struct super_block *sb;
+	struct inode *inode;
+
+	sb = dentry->d_sb;
+	SiMustAnyLock(sb);
+	h_dentry = au_h_dptr(dentry, bindex);
+	err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex));
+	if (unlikely(err))
+		goto out;
+
+	/* stop monitoring */
+	inode = d_inode(dentry);
+	au_hn_free(au_hi(inode, bindex));
+
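+	/*
+	 * on a local branch, the actual removal may be deferred to the
+	 * workqueue depending on the 'dirwh' threshold and the number of
+	 * child whiteouts.  a remote branch is always handled synchronously
+	 * below.
+	 */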
+	if (!au_test_fs_remote(h_dentry->d_sb)) {
+		dirwh = au_sbi(sb)->si_dirwh;
+		rmdir_later = (dirwh <= 1);
+		if (!rmdir_later)
+			rmdir_later = au_nhash_test_longer_wh(whlist, bindex,
+							      dirwh);
+		if (rmdir_later)
+			return rmdir_later;
+	}
+
+	err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist);
+	if (unlikely(err)) {
+		AuIOErr("rmdir %pd, b%d failed, %d. ignored\n",
+			h_dentry, bindex, err);
+		err = 0;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * final procedure for deleting an entry.
+ * maintain dentry and iattr.
+ */
+static void epilog(struct inode *dir, struct dentry *dentry,
+		   aufs_bindex_t bindex)
+{
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+	d_drop(dentry);
+	inode->i_ctime = dir->i_ctime;
+
+	au_dir_ts(dir, bindex);
+	dir->i_version++;
+}
+
+/*
+ * when an error happens, remove the created whiteout and revert everything.
+ */
+static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex,
+		     aufs_bindex_t bwh, struct dentry *wh_dentry,
+		     struct dentry *dentry, struct au_dtime *dt)
+{
+	int rerr;
+	struct path h_path = {
+		.dentry	= wh_dentry,
+		.mnt	= au_sbr_mnt(dir->i_sb, bindex)
+	};
+
+	rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry);
+	if (!rerr) {
+		au_set_dbwh(dentry, bwh);
+		au_dtime_revert(dt);
+		return 0;
+	}
+
+	AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr);
+	return -EIO;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int aufs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bwh, bindex, bstart;
+	struct inode *inode, *h_dir, *delegated;
+	struct dentry *parent, *wh_dentry;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+		struct path h_path;
+	} *a;
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_d_hashed_positive(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	err = -EISDIR;
+	if (unlikely(d_is_dir(dentry)))
+		goto out_unlock; /* possible? */
+
+	bstart = au_dbstart(dentry);
+	bwh = au_dbwh(dentry);
+	bindex = -1;
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt,
+					&a->pin);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	a->h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart);
+	a->h_path.dentry = au_h_dptr(dentry, bstart);
+	dget(a->h_path.dentry);
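+	/*
+	 * when the entry exists on the target (writable) branch, unlink it
+	 * there.  otherwise the whiteout created above is enough to hide the
+	 * entry on the lower branch.
+	 */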
+	if (bindex == bstart) {
+		h_dir = au_pinned_h_dir(&a->pin);
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	} else {
+		/* dir inode is locked */
+		h_dir = d_inode(wh_dentry->d_parent);
+		IMustLock(h_dir);
+		err = 0;
+	}
+
+	if (!err) {
+		vfsub_drop_nlink(inode);
+		epilog(dir, dentry, bindex);
+
+		/* update target timestamps */
+		if (bindex == bstart) {
+			vfsub_update_h_iattr(&a->h_path, /*did*/NULL);
+			/*ignore*/
+			inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
+		} else
+			/* todo: this timestamp may be reverted later */
+			inode->i_ctime = h_dir->i_ctime;
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	if (wh_dentry) {
+		int rerr;
+
+		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
+				 &a->dt);
+		if (rerr)
+			err = rerr;
+	}
+
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+	dput(a->h_path.dentry);
+out_parent:
+	di_write_unlock(parent);
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	kfree(a);
+out:
+	return err;
+}
+
+int aufs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err, rmdir_later;
+	aufs_bindex_t bwh, bindex, bstart;
+	struct inode *inode;
+	struct dentry *parent, *wh_dentry, *h_dentry;
+	struct au_whtmp_rmdir *args;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+	} *a;
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_alive_dir(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	err = -ENOTDIR;
+	if (unlikely(!d_is_dir(dentry)))
+		goto out_unlock; /* possible? */
+
+	err = -ENOMEM;
+	args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS);
+	if (unlikely(!args))
+		goto out_unlock;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	err = au_test_empty(dentry, &args->whlist);
+	if (unlikely(err))
+		goto out_parent;
+
+	bstart = au_dbstart(dentry);
+	bwh = au_dbwh(dentry);
+	bindex = -1;
+	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt,
+					&a->pin);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	h_dentry = au_h_dptr(dentry, bstart);
+	dget(h_dentry);
+	rmdir_later = 0;
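+	/*
+	 * when the dir exists on the target branch, rename it to a temporary
+	 * whiteout-ed name and remove it (possibly deferred, see
+	 * renwh_and_rmdir()).  otherwise the new whiteout alone hides the
+	 * lower dir.
+	 */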
+	if (bindex == bstart) {
+		err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir);
+		if (err > 0) {
+			rmdir_later = err;
+			err = 0;
+		}
+	} else {
+		/* stop monitoring */
+		au_hn_free(au_hi(inode, bstart));
+
+		/* dir inode is locked */
+		IMustLock(d_inode(wh_dentry->d_parent));
+		err = 0;
+	}
+
+	if (!err) {
+		vfsub_dead_dir(inode);
+		au_set_dbdiropq(dentry, -1);
+		epilog(dir, dentry, bindex);
+
+		if (rmdir_later) {
+			au_whtmp_kick_rmdir(dir, bstart, h_dentry, args);
+			args = NULL;
+		}
+
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	AuLabel(revert);
+	if (wh_dentry) {
+		int rerr;
+
+		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
+				 &a->dt);
+		if (rerr)
+			err = rerr;
+	}
+
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+	dput(h_dentry);
+out_parent:
+	di_write_unlock(parent);
+	if (args)
+		au_whtmp_rmdir_free(args);
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
diff --git b/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c
new file mode 100644
index 0000000..de4e03b
--- /dev/null
+++ b/fs/aufs/i_op_ren.c
@@ -0,0 +1,1002 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operation (rename entry)
+ * todo: this is a crazy monster
+ */
+
+#include "aufs.h"
+
+enum { AuSRC, AuDST, AuSrcDst };
+enum { AuPARENT, AuCHILD, AuParentChild };
+
+#define AuRen_ISDIR	1
+#define AuRen_ISSAMEDIR	(1 << 1)
+#define AuRen_WHSRC	(1 << 2)
+#define AuRen_WHDST	(1 << 3)
+#define AuRen_MNT_WRITE	(1 << 4)
+#define AuRen_DT_DSTDIR	(1 << 5)
+#define AuRen_DIROPQ	(1 << 6)
+#define au_ftest_ren(flags, name)	((flags) & AuRen_##name)
+#define au_fset_ren(flags, name) \
+	do { (flags) |= AuRen_##name; } while (0)
+#define au_fclr_ren(flags, name) \
+	do { (flags) &= ~AuRen_##name; } while (0)
+
+struct au_ren_args {
+	struct {
+		struct dentry *dentry, *h_dentry, *parent, *h_parent,
+			*wh_dentry;
+		struct inode *dir, *inode;
+		struct au_hinode *hdir;
+		struct au_dtime dt[AuParentChild];
+		aufs_bindex_t bstart;
+	} sd[AuSrcDst];
+
+#define src_dentry	sd[AuSRC].dentry
+#define src_dir		sd[AuSRC].dir
+#define src_inode	sd[AuSRC].inode
+#define src_h_dentry	sd[AuSRC].h_dentry
+#define src_parent	sd[AuSRC].parent
+#define src_h_parent	sd[AuSRC].h_parent
+#define src_wh_dentry	sd[AuSRC].wh_dentry
+#define src_hdir	sd[AuSRC].hdir
+#define src_h_dir	sd[AuSRC].hdir->hi_inode
+#define src_dt		sd[AuSRC].dt
+#define src_bstart	sd[AuSRC].bstart
+
+#define dst_dentry	sd[AuDST].dentry
+#define dst_dir		sd[AuDST].dir
+#define dst_inode	sd[AuDST].inode
+#define dst_h_dentry	sd[AuDST].h_dentry
+#define dst_parent	sd[AuDST].parent
+#define dst_h_parent	sd[AuDST].h_parent
+#define dst_wh_dentry	sd[AuDST].wh_dentry
+#define dst_hdir	sd[AuDST].hdir
+#define dst_h_dir	sd[AuDST].hdir->hi_inode
+#define dst_dt		sd[AuDST].dt
+#define dst_bstart	sd[AuDST].bstart
+
+	struct dentry *h_trap;
+	struct au_branch *br;
+	struct au_hinode *src_hinode;
+	struct path h_path;
+	struct au_nhash whlist;
+	aufs_bindex_t btgt, src_bwh, src_bdiropq;
+
+	unsigned int flags;
+
+	struct au_whtmp_rmdir *thargs;
+	struct dentry *h_dst;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * functions for reverting.
+ * when an error happens in a single rename system call, we should revert
+ * everything as if nothing had happened.
+ * we don't need to revert the copied-up/down parent dirs since they are
+ * harmless.
+ */
+
+#define RevertFailure(fmt, ...) do { \
+	AuIOErr("revert failure: " fmt " (%d, %d)\n", \
+		##__VA_ARGS__, err, rerr); \
+	err = -EIO; \
+} while (0)
+
+static void au_ren_rev_diropq(int err, struct au_ren_args *a)
+{
+	int rerr;
+
+	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
+	rerr = au_diropq_remove(a->src_dentry, a->btgt);
+	au_hn_imtx_unlock(a->src_hinode);
+	au_set_dbdiropq(a->src_dentry, a->src_bdiropq);
+	if (rerr)
+		RevertFailure("remove diropq %pd", a->src_dentry);
+}
+
+static void au_ren_rev_rename(int err, struct au_ren_args *a)
+{
+	int rerr;
+	struct inode *delegated;
+
+	a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name,
+					  a->src_h_parent);
+	rerr = PTR_ERR(a->h_path.dentry);
+	if (IS_ERR(a->h_path.dentry)) {
+		RevertFailure("lkup one %pd", a->src_dentry);
+		return;
+	}
+
+	delegated = NULL;
+	rerr = vfsub_rename(a->dst_h_dir,
+			    au_h_dptr(a->src_dentry, a->btgt),
+			    a->src_h_dir, &a->h_path, &delegated);
+	if (unlikely(rerr == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	d_drop(a->h_path.dentry);
+	dput(a->h_path.dentry);
+	/* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */
+	if (rerr)
+		RevertFailure("rename %pd", a->src_dentry);
+}
+
+static void au_ren_rev_whtmp(int err, struct au_ren_args *a)
+{
+	int rerr;
+	struct inode *delegated;
+
+	a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name,
+					  a->dst_h_parent);
+	rerr = PTR_ERR(a->h_path.dentry);
+	if (IS_ERR(a->h_path.dentry)) {
+		RevertFailure("lkup one %pd", a->dst_dentry);
+		return;
+	}
+	if (d_is_positive(a->h_path.dentry)) {
+		d_drop(a->h_path.dentry);
+		dput(a->h_path.dentry);
+		return;
+	}
+
+	delegated = NULL;
+	rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path,
+			    &delegated);
+	if (unlikely(rerr == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	d_drop(a->h_path.dentry);
+	dput(a->h_path.dentry);
+	if (!rerr)
+		au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst));
+	else
+		RevertFailure("rename %pd", a->h_dst);
+}
+
+static void au_ren_rev_whsrc(int err, struct au_ren_args *a)
+{
+	int rerr;
+
+	a->h_path.dentry = a->src_wh_dentry;
+	rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry);
+	au_set_dbwh(a->src_dentry, a->src_bwh);
+	if (rerr)
+		RevertFailure("unlink %pd", a->src_wh_dentry);
+}
+#undef RevertFailure
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * when we have to copyup the renaming entry, do it with the rename-target name
+ * in order to minimize the cost (the later actual rename is unnecessary).
+ * otherwise rename it on the target branch.
+ */
+static int au_ren_or_cpup(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *d;
+	struct inode *delegated;
+
+	d = a->src_dentry;
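+	/*
+	 * by now src must exist on the target branch; the caller has already
+	 * copied it up when necessary.
+	 */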
+	if (au_dbstart(d) == a->btgt) {
+		a->h_path.dentry = a->dst_h_dentry;
+		if (au_ftest_ren(a->flags, DIROPQ)
+		    && au_dbdiropq(d) == a->btgt)
+			au_fclr_ren(a->flags, DIROPQ);
+		AuDebugOn(au_dbstart(d) != a->btgt);
+		delegated = NULL;
+		err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt),
+				   a->dst_h_dir, &a->h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal rename\n");
+			iput(delegated);
+		}
+	} else
+		BUG();
+
+	if (!err && a->h_dst)
+		/* it will be set to dinfo later */
+		dget(a->h_dst);
+
+	return err;
+}
+
+/* cf. aufs_rmdir() */
+static int au_ren_del_whtmp(struct au_ren_args *a)
+{
+	int err;
+	struct inode *dir;
+
+	dir = a->dst_dir;
+	SiMustAnyLock(dir->i_sb);
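+	/*
+	 * remove the whtmp dir now when it is small enough or lives on a
+	 * remote fs; otherwise hand it, together with the whlist, over to the
+	 * workqueue for asynchronous removal.
+	 */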
+	if (!au_nhash_test_longer_wh(&a->whlist, a->btgt,
+				     au_sbi(dir->i_sb)->si_dirwh)
+	    || au_test_fs_remote(a->h_dst->d_sb)) {
+		err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist);
+		if (unlikely(err))
+			pr_warn("failed removing whtmp dir %pd (%d), "
+				"ignored.\n", a->h_dst, err);
+	} else {
+		au_nhash_wh_free(&a->thargs->whlist);
+		a->thargs->whlist = a->whlist;
+		a->whlist.nh_num = 0;
+		au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs);
+		dput(a->h_dst);
+		a->thargs = NULL;
+	}
+
+	return 0;
+}
+
+/* make it an 'opaque' dir. */
+static int au_ren_diropq(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *diropq;
+
+	err = 0;
+	a->src_bdiropq = au_dbdiropq(a->src_dentry);
+	a->src_hinode = au_hi(a->src_inode, a->btgt);
+	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
+	diropq = au_diropq_create(a->src_dentry, a->btgt);
+	au_hn_imtx_unlock(a->src_hinode);
+	if (IS_ERR(diropq))
+		err = PTR_ERR(diropq);
+	else
+		dput(diropq);
+
+	return err;
+}
+
+static int do_rename(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *d, *h_d;
+
+	/* prepare workqueue args for asynchronous rmdir */
+	h_d = a->dst_h_dentry;
+	if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) {
+		err = -ENOMEM;
+		a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS);
+		if (unlikely(!a->thargs))
+			goto out;
+		a->h_dst = dget(h_d);
+	}
+
+	/* create whiteout for src_dentry */
+	if (au_ftest_ren(a->flags, WHSRC)) {
+		a->src_bwh = au_dbwh(a->src_dentry);
+		AuDebugOn(a->src_bwh >= 0);
+		a->src_wh_dentry
+			= au_wh_create(a->src_dentry, a->btgt, a->src_h_parent);
+		err = PTR_ERR(a->src_wh_dentry);
+		if (IS_ERR(a->src_wh_dentry))
+			goto out_thargs;
+	}
+
+	/* lookup whiteout for dentry */
+	if (au_ftest_ren(a->flags, WHDST)) {
+		h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name,
+				 a->br);
+		err = PTR_ERR(h_d);
+		if (IS_ERR(h_d))
+			goto out_whsrc;
+		if (d_is_negative(h_d))
+			dput(h_d);
+		else
+			a->dst_wh_dentry = h_d;
+	}
+
+	/* rename dentry to tmpwh */
+	if (a->thargs) {
+		err = au_whtmp_ren(a->dst_h_dentry, a->br);
+		if (unlikely(err))
+			goto out_whdst;
+
+		d = a->dst_dentry;
+		au_set_h_dptr(d, a->btgt, NULL);
+		err = au_lkup_neg(d, a->btgt, /*wh*/0);
+		if (unlikely(err))
+			goto out_whtmp;
+		a->dst_h_dentry = au_h_dptr(d, a->btgt);
+	}
+
+	BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_bstart != a->btgt);
+
+	/* rename by vfs_rename or cpup */
+	d = a->dst_dentry;
+	if (au_ftest_ren(a->flags, ISDIR)
+	    && (a->dst_wh_dentry
+		|| au_dbdiropq(d) == a->btgt
+		/* hide the lower to keep xino */
+		|| a->btgt < au_dbend(d)
+		|| au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ)))
+		au_fset_ren(a->flags, DIROPQ);
+	err = au_ren_or_cpup(a);
+	if (unlikely(err))
+		/* leave the copied-up one */
+		goto out_whtmp;
+
+	/* make dir opaque */
+	if (au_ftest_ren(a->flags, DIROPQ)) {
+		err = au_ren_diropq(a);
+		if (unlikely(err))
+			goto out_rename;
+	}
+
+	/* update target timestamps */
+	AuDebugOn(au_dbstart(a->src_dentry) != a->btgt);
+	a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt);
+	vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/
+	a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
+
+	/* remove whiteout for dentry */
+	if (a->dst_wh_dentry) {
+		a->h_path.dentry = a->dst_wh_dentry;
+		err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path,
+					  a->dst_dentry);
+		if (unlikely(err))
+			goto out_diropq;
+	}
+
+	/* remove whtmp */
+	if (a->thargs)
+		au_ren_del_whtmp(a); /* ignore this error */
+
+	au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0);
+	err = 0;
+	goto out_success;
+
+out_diropq:
+	if (au_ftest_ren(a->flags, DIROPQ))
+		au_ren_rev_diropq(err, a);
+out_rename:
+	au_ren_rev_rename(err, a);
+	dput(a->h_dst);
+out_whtmp:
+	if (a->thargs)
+		au_ren_rev_whtmp(err, a);
+out_whdst:
+	dput(a->dst_wh_dentry);
+	a->dst_wh_dentry = NULL;
+out_whsrc:
+	if (a->src_wh_dentry)
+		au_ren_rev_whsrc(err, a);
+out_success:
+	dput(a->src_wh_dentry);
+	dput(a->dst_wh_dentry);
+out_thargs:
+	if (a->thargs) {
+		dput(a->h_dst);
+		au_whtmp_rmdir_free(a->thargs);
+		a->thargs = NULL;
+	}
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * test if the @dentry dir can be a rename destination or not.
+ * success means it is a logically empty dir.
+ */
+static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist)
+{
+	return au_test_empty(dentry, whlist);
+}
+
+/*
+ * test if the @dentry dir can be a rename source or not.
+ * if it can, return 0.
+ * success means,
+ * - it is a logically empty dir,
+ * - or, it exists on the writable branch and has no children, including
+ *       whiteouts, on the lower branches.
+ */
+static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt)
+{
+	int err;
+	unsigned int rdhash;
+	aufs_bindex_t bstart;
+
+	bstart = au_dbstart(dentry);
+	if (bstart != btgt) {
+		struct au_nhash whlist;
+
+		SiMustAnyLock(dentry->d_sb);
+		rdhash = au_sbi(dentry->d_sb)->si_rdhash;
+		if (!rdhash)
+			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL,
+							   dentry));
+		err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
+		if (unlikely(err))
+			goto out;
+		err = au_test_empty(dentry, &whlist);
+		au_nhash_wh_free(&whlist);
+		goto out;
+	}
+
+	if (bstart == au_dbtaildir(dentry))
+		return 0; /* success */
+
+	err = au_test_empty_lower(dentry);
+
+out:
+	if (err == -ENOTEMPTY) {
+		AuWarn1("renaming dir who has child(ren) on multiple branches,"
+			" is not supported\n");
+		err = -EXDEV;
+	}
+	return err;
+}
+
+/* side effect: sets whlist and h_dentry */
+static int au_ren_may_dir(struct au_ren_args *a)
+{
+	int err;
+	unsigned int rdhash;
+	struct dentry *d;
+
+	d = a->dst_dentry;
+	SiMustAnyLock(d->d_sb);
+
+	err = 0;
+	if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) {
+		rdhash = au_sbi(d->d_sb)->si_rdhash;
+		if (!rdhash)
+			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d));
+		err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS);
+		if (unlikely(err))
+			goto out;
+
+		au_set_dbstart(d, a->dst_bstart);
+		err = may_rename_dstdir(d, &a->whlist);
+		au_set_dbstart(d, a->btgt);
+	}
+	a->dst_h_dentry = au_h_dptr(d, au_dbstart(d));
+	if (unlikely(err))
+		goto out;
+
+	d = a->src_dentry;
+	a->src_h_dentry = au_h_dptr(d, au_dbstart(d));
+	if (au_ftest_ren(a->flags, ISDIR)) {
+		err = may_rename_srcdir(d, a->btgt);
+		if (unlikely(err)) {
+			au_nhash_wh_free(&a->whlist);
+			a->whlist.nh_num = 0;
+		}
+	}
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * simple tests for rename.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+static int au_may_ren(struct au_ren_args *a)
+{
+	int err, isdir;
+	struct inode *h_inode;
+
+	if (a->src_bstart == a->btgt) {
+		err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent,
+				 au_ftest_ren(a->flags, ISDIR));
+		if (unlikely(err))
+			goto out;
+		err = -EINVAL;
+		if (unlikely(a->src_h_dentry == a->h_trap))
+			goto out;
+	}
+
+	err = 0;
+	if (a->dst_bstart != a->btgt)
+		goto out;
+
+	err = -ENOTEMPTY;
+	if (unlikely(a->dst_h_dentry == a->h_trap))
+		goto out;
+
+	err = -EIO;
+	isdir = !!au_ftest_ren(a->flags, ISDIR);
+	if (d_really_is_negative(a->dst_dentry)) {
+		if (d_is_negative(a->dst_h_dentry))
+			err = au_may_add(a->dst_dentry, a->btgt,
+					 a->dst_h_parent, isdir);
+	} else {
+		if (unlikely(d_is_negative(a->dst_h_dentry)))
+			goto out;
+		h_inode = d_inode(a->dst_h_dentry);
+		if (h_inode->i_nlink)
+			err = au_may_del(a->dst_dentry, a->btgt,
+					 a->dst_h_parent, isdir);
+	}
+
+out:
+	if (unlikely(err == -ENOENT || err == -EEXIST))
+		err = -EIO;
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * locking order
+ * (VFS)
+ * - src_dir and dir by lock_rename()
+ * - inode if it exists
+ * (aufs)
+ * - lock all
+ *   + src_dentry and dentry by aufs_read_and_write_lock2() which calls,
+ *     + si_read_lock
+ *     + di_write_lock2_child()
+ *       + di_write_lock_child()
+ *	   + ii_write_lock_child()
+ *       + di_write_lock_child2()
+ *	   + ii_write_lock_child2()
+ *     + src_parent and parent
+ *       + di_write_lock_parent()
+ *	   + ii_write_lock_parent()
+ *       + di_write_lock_parent2()
+ *	   + ii_write_lock_parent2()
+ *   + lower src_dir and dir by vfsub_lock_rename()
+ *   + verify every relationship between child and parent. if any
+ *     of them fails, unlock all and return -EBUSY.
+ */
+static void au_ren_unlock(struct au_ren_args *a)
+{
+	vfsub_unlock_rename(a->src_h_parent, a->src_hdir,
+			    a->dst_h_parent, a->dst_hdir);
+	if (au_ftest_ren(a->flags, MNT_WRITE))
+		vfsub_mnt_drop_write(au_br_mnt(a->br));
+}
+
+static int au_ren_lock(struct au_ren_args *a)
+{
+	int err;
+	unsigned int udba;
+
+	err = 0;
+	a->src_h_parent = au_h_dptr(a->src_parent, a->btgt);
+	a->src_hdir = au_hi(a->src_dir, a->btgt);
+	a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt);
+	a->dst_hdir = au_hi(a->dst_dir, a->btgt);
+
+	err = vfsub_mnt_want_write(au_br_mnt(a->br));
+	if (unlikely(err))
+		goto out;
+	au_fset_ren(a->flags, MNT_WRITE);
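+	/*
+	 * lock both lower parent dirs; the dentry returned by
+	 * vfsub_lock_rename() is kept in h_trap and checked later in
+	 * au_may_ren().
+	 */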
+	a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir,
+				      a->dst_h_parent, a->dst_hdir);
+	udba = au_opt_udba(a->src_dentry->d_sb);
+	if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent)
+		     || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent)))
+		err = au_busy_or_stale();
+	if (!err && au_dbstart(a->src_dentry) == a->btgt)
+		err = au_h_verify(a->src_h_dentry, udba,
+				  d_inode(a->src_h_parent), a->src_h_parent,
+				  a->br);
+	if (!err && au_dbstart(a->dst_dentry) == a->btgt)
+		err = au_h_verify(a->dst_h_dentry, udba,
+				  d_inode(a->dst_h_parent), a->dst_h_parent,
+				  a->br);
+	if (!err)
+		goto out; /* success */
+
+	err = au_busy_or_stale();
+	au_ren_unlock(a);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_ren_refresh_dir(struct au_ren_args *a)
+{
+	struct inode *dir;
+
+	dir = a->dst_dir;
+	dir->i_version++;
+	if (au_ftest_ren(a->flags, ISDIR)) {
+		/* is this update defined in POSIX? */
+		au_cpup_attr_timesizes(a->src_inode);
+		au_cpup_attr_nlink(dir, /*force*/1);
+	}
+
+	au_dir_ts(dir, a->btgt);
+
+	if (au_ftest_ren(a->flags, ISSAMEDIR))
+		return;
+
+	dir = a->src_dir;
+	dir->i_version++;
+	if (au_ftest_ren(a->flags, ISDIR))
+		au_cpup_attr_nlink(dir, /*force*/1);
+	au_dir_ts(dir, a->btgt);
+}
+
+static void au_ren_refresh(struct au_ren_args *a)
+{
+	aufs_bindex_t bend, bindex;
+	struct dentry *d, *h_d;
+	struct inode *i, *h_i;
+	struct super_block *sb;
+
+	d = a->dst_dentry;
+	d_drop(d);
+	if (a->h_dst)
+		/* already dget-ed by au_ren_or_cpup() */
+		au_set_h_dptr(d, a->btgt, a->h_dst);
+
+	i = a->dst_inode;
+	if (i) {
+		if (!au_ftest_ren(a->flags, ISDIR))
+			vfsub_drop_nlink(i);
+		else {
+			vfsub_dead_dir(i);
+			au_cpup_attr_timesizes(i);
+		}
+		au_update_dbrange(d, /*do_put_zero*/1);
+	} else {
+		bend = a->btgt;
+		for (bindex = au_dbstart(d); bindex < bend; bindex++)
+			au_set_h_dptr(d, bindex, NULL);
+		bend = au_dbend(d);
+		for (bindex = a->btgt + 1; bindex <= bend; bindex++)
+			au_set_h_dptr(d, bindex, NULL);
+		au_update_dbrange(d, /*do_put_zero*/0);
+	}
+
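+	/*
+	 * the source now lives only on the target branch; clear its whiteout
+	 * index and drop the stale dentries on the branches below btgt.
+	 */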
+	d = a->src_dentry;
+	au_set_dbwh(d, -1);
+	bend = au_dbend(d);
+	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
+		h_d = au_h_dptr(d, bindex);
+		if (h_d)
+			au_set_h_dptr(d, bindex, NULL);
+	}
+	au_set_dbend(d, a->btgt);
+
+	sb = d->d_sb;
+	i = a->src_inode;
+	if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i))
+		return; /* success */
+
+	bend = au_ibend(i);
+	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
+		h_i = au_h_iptr(i, bindex);
+		if (h_i) {
+			au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0);
+			/* ignore this error */
+			au_set_h_iptr(i, bindex, NULL, 0);
+		}
+	}
+	au_set_ibend(i, a->btgt);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* mainly for link(2) and rename(2) */
+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt)
+{
+	aufs_bindex_t bdiropq, bwh;
+	struct dentry *parent;
+	struct au_branch *br;
+
+	parent = dentry->d_parent;
+	IMustLock(d_inode(parent)); /* dir is locked */
+
+	bdiropq = au_dbdiropq(parent);
+	bwh = au_dbwh(dentry);
+	br = au_sbr(dentry->d_sb, btgt);
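+	/*
+	 * reject @btgt when the branch is readonly, or when a diropq of the
+	 * parent or a whiteout of @dentry exists on a branch above it.
+	 */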
+	if (au_br_rdonly(br)
+	    || (0 <= bdiropq && bdiropq < btgt)
+	    || (0 <= bwh && bwh < btgt))
+		btgt = -1;
+
+	AuDbg("btgt %d\n", btgt);
+	return btgt;
+}
+
+/* sets src_bstart, dst_bstart and btgt */
+static int au_ren_wbr(struct au_ren_args *a)
+{
+	int err;
+	struct au_wr_dir_args wr_dir_args = {
+		/* .force_btgt	= -1, */
+		.flags		= AuWrDir_ADD_ENTRY
+	};
+
+	a->src_bstart = au_dbstart(a->src_dentry);
+	a->dst_bstart = au_dbstart(a->dst_dentry);
+	if (au_ftest_ren(a->flags, ISDIR))
+		au_fset_wrdir(wr_dir_args.flags, ISDIR);
+	wr_dir_args.force_btgt = a->src_bstart;
+	if (a->dst_inode && a->dst_bstart < a->src_bstart)
+		wr_dir_args.force_btgt = a->dst_bstart;
+	wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt);
+	err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args);
+	a->btgt = err;
+
+	return err;
+}
+
+static void au_ren_dt(struct au_ren_args *a)
+{
+	a->h_path.dentry = a->src_h_parent;
+	au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path);
+	if (!au_ftest_ren(a->flags, ISSAMEDIR)) {
+		a->h_path.dentry = a->dst_h_parent;
+		au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path);
+	}
+
+	au_fclr_ren(a->flags, DT_DSTDIR);
+	if (!au_ftest_ren(a->flags, ISDIR))
+		return;
+
+	a->h_path.dentry = a->src_h_dentry;
+	au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path);
+	if (d_is_positive(a->dst_h_dentry)) {
+		au_fset_ren(a->flags, DT_DSTDIR);
+		a->h_path.dentry = a->dst_h_dentry;
+		au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path);
+	}
+}
+
+static void au_ren_rev_dt(int err, struct au_ren_args *a)
+{
+	struct dentry *h_d;
+	struct inode *h_inode;
+
+	au_dtime_revert(a->src_dt + AuPARENT);
+	if (!au_ftest_ren(a->flags, ISSAMEDIR))
+		au_dtime_revert(a->dst_dt + AuPARENT);
+
+	if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) {
+		h_d = a->src_dt[AuCHILD].dt_h_path.dentry;
+		h_inode = d_inode(h_d);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		au_dtime_revert(a->src_dt + AuCHILD);
+		inode_unlock(h_inode);
+
+		if (au_ftest_ren(a->flags, DT_DSTDIR)) {
+			h_d = a->dst_dt[AuCHILD].dt_h_path.dentry;
+			h_inode = d_inode(h_d);
+			inode_lock_nested(h_inode, AuLsc_I_CHILD);
+			au_dtime_revert(a->dst_dt + AuCHILD);
+			inode_unlock(h_inode);
+		}
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry,
+		struct inode *_dst_dir, struct dentry *_dst_dentry)
+{
+	int err, flags;
+	/* reduce stack space */
+	struct au_ren_args *a;
+
+	AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry);
+	IMustLock(_src_dir);
+	IMustLock(_dst_dir);
+
+	err = -ENOMEM;
+	BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE);
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	a->src_dir = _src_dir;
+	a->src_dentry = _src_dentry;
+	a->src_inode = NULL;
+	if (d_really_is_positive(a->src_dentry))
+		a->src_inode = d_inode(a->src_dentry);
+	a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */
+	a->dst_dir = _dst_dir;
+	a->dst_dentry = _dst_dentry;
+	a->dst_inode = NULL;
+	if (d_really_is_positive(a->dst_dentry))
+		a->dst_inode = d_inode(a->dst_dentry);
+	a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */
+	if (a->dst_inode) {
+		IMustLock(a->dst_inode);
+		au_igrab(a->dst_inode);
+	}
+
+	err = -ENOTDIR;
+	flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN;
+	if (d_is_dir(a->src_dentry)) {
+		au_fset_ren(a->flags, ISDIR);
+		if (unlikely(d_really_is_positive(a->dst_dentry)
+			     && !d_is_dir(a->dst_dentry)))
+			goto out_free;
+		flags |= AuLock_DIRS;
+	}
+	err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, flags);
+	if (unlikely(err))
+		goto out_free;
+
+	err = au_d_hashed_positive(a->src_dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	err = -ENOENT;
+	if (a->dst_inode) {
+		/*
+		 * If it is a dir, the VFS unhashes dst_dentry before calling
+		 * this function, which means we cannot rely upon d_unhashed().
+		 */
+		if (unlikely(!a->dst_inode->i_nlink))
+			goto out_unlock;
+		if (!S_ISDIR(a->dst_inode->i_mode)) {
+			err = au_d_hashed_positive(a->dst_dentry);
+			if (unlikely(err))
+				goto out_unlock;
+		} else if (unlikely(IS_DEADDIR(a->dst_inode)))
+			goto out_unlock;
+	} else if (unlikely(d_unhashed(a->dst_dentry)))
+		goto out_unlock;
+
+	/*
+	 * is it possible?
+	 * yes, it happened (in linux-3.3-rcN), but I don't know why.
+	 * there may be a problem somewhere else.
+	 */
+	err = -EINVAL;
+	if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry)))
+		goto out_unlock;
+
+	au_fset_ren(a->flags, ISSAMEDIR); /* temporary */
+	di_write_lock_parent(a->dst_parent);
+
+	/* which branch we process */
+	err = au_ren_wbr(a);
+	if (unlikely(err < 0))
+		goto out_parent;
+	a->br = au_sbr(a->dst_dentry->d_sb, a->btgt);
+	a->h_path.mnt = au_br_mnt(a->br);
+
+	/* are they available to be renamed */
+	err = au_ren_may_dir(a);
+	if (unlikely(err))
+		goto out_children;
+
+	/* prepare the writable parent dir on the same branch */
+	if (a->dst_bstart == a->btgt) {
+		au_fset_ren(a->flags, WHDST);
+	} else {
+		err = au_cpup_dirs(a->dst_dentry, a->btgt);
+		if (unlikely(err))
+			goto out_children;
+	}
+
+	if (a->src_dir != a->dst_dir) {
+		/*
+		 * this temporary unlock is safe,
+		 * because both dirs' i_mutex are held.
+		 */
+		di_write_unlock(a->dst_parent);
+		di_write_lock_parent(a->src_parent);
+		err = au_wr_dir_need_wh(a->src_dentry,
+					au_ftest_ren(a->flags, ISDIR),
+					&a->btgt);
+		di_write_unlock(a->src_parent);
+		di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1);
+		au_fclr_ren(a->flags, ISSAMEDIR);
+	} else
+		err = au_wr_dir_need_wh(a->src_dentry,
+					au_ftest_ren(a->flags, ISDIR),
+					&a->btgt);
+	if (unlikely(err < 0))
+		goto out_children;
+	if (err)
+		au_fset_ren(a->flags, WHSRC);
+
+	/* cpup src */
+	if (a->src_bstart != a->btgt) {
+		struct au_pin pin;
+
+		err = au_pin(&pin, a->src_dentry, a->btgt,
+			     au_opt_udba(a->src_dentry->d_sb),
+			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+		if (!err) {
+			struct au_cp_generic cpg = {
+				.dentry	= a->src_dentry,
+				.bdst	= a->btgt,
+				.bsrc	= a->src_bstart,
+				.len	= -1,
+				.pin	= &pin,
+				.flags	= AuCpup_DTIME | AuCpup_HOPEN
+			};
+			AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart);
+			err = au_sio_cpup_simple(&cpg);
+			au_unpin(&pin);
+		}
+		if (unlikely(err))
+			goto out_children;
+		a->src_bstart = a->btgt;
+		a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt);
+		au_fset_ren(a->flags, WHSRC);
+	}
+
+	/* lock them all */
+	err = au_ren_lock(a);
+	if (unlikely(err))
+		/* leave the copied-up one */
+		goto out_children;
+
+	if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE))
+		err = au_may_ren(a);
+	else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN))
+		err = -ENAMETOOLONG;
+	if (unlikely(err))
+		goto out_hdir;
+
+	/* store timestamps to be revertible */
+	au_ren_dt(a);
+
+	/* here we go */
+	err = do_rename(a);
+	if (unlikely(err))
+		goto out_dt;
+
+	/* update dir attributes */
+	au_ren_refresh_dir(a);
+
+	/* dput/iput all lower dentries */
+	au_ren_refresh(a);
+
+	goto out_hdir; /* success */
+
+out_dt:
+	au_ren_rev_dt(err, a);
+out_hdir:
+	au_ren_unlock(a);
+out_children:
+	au_nhash_wh_free(&a->whlist);
+	if (err && a->dst_inode && a->dst_bstart != a->btgt) {
+		AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt);
+		au_set_h_dptr(a->dst_dentry, a->btgt, NULL);
+		au_set_dbstart(a->dst_dentry, a->dst_bstart);
+	}
+out_parent:
+	if (!err)
+		d_move(a->src_dentry, a->dst_dentry);
+	else {
+		au_update_dbstart(a->dst_dentry);
+		if (!a->dst_inode)
+			d_drop(a->dst_dentry);
+	}
+	if (au_ftest_ren(a->flags, ISSAMEDIR))
+		di_write_unlock(a->dst_parent);
+	else
+		di_write_unlock2(a->src_parent, a->dst_parent);
+out_unlock:
+	aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry);
+out_free:
+	iput(a->dst_inode);
+	if (a->thargs)
+		au_whtmp_rmdir_free(a->thargs);
+	kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
diff --git b/fs/aufs/iinfo.c b/fs/aufs/iinfo.c
new file mode 100644
index 0000000..67ef672
--- /dev/null
+++ b/fs/aufs/iinfo.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode private data
+ */
+
+#include "aufs.h"
+
+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex)
+{
+	struct inode *h_inode;
+
+	IiMustAnyLock(inode);
+
+	h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode;
+	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
+	return h_inode;
+}
+
+/* todo: hard/soft set? */
+void au_hiput(struct au_hinode *hinode)
+{
+	au_hn_free(hinode);
+	dput(hinode->hi_whdentry);
+	iput(hinode->hi_inode);
+}
+
+unsigned int au_hi_flags(struct inode *inode, int isdir)
+{
+	unsigned int flags;
+	const unsigned int mnt_flags = au_mntflags(inode->i_sb);
+
+	flags = 0;
+	if (au_opt_test(mnt_flags, XINO))
+		au_fset_hi(flags, XINO);
+	if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY))
+		au_fset_hi(flags, HNOTIFY);
+	return flags;
+}
+
+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
+		   struct inode *h_inode, unsigned int flags)
+{
+	struct au_hinode *hinode;
+	struct inode *hi;
+	struct au_iinfo *iinfo = au_ii(inode);
+
+	IiMustWriteLock(inode);
+
+	hinode = iinfo->ii_hinode + bindex;
+	hi = hinode->hi_inode;
+	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
+
+	if (hi)
+		au_hiput(hinode);
+	hinode->hi_inode = h_inode;
+	if (h_inode) {
+		int err;
+		struct super_block *sb = inode->i_sb;
+		struct au_branch *br;
+
+		AuDebugOn(inode->i_mode
+			  && (h_inode->i_mode & S_IFMT)
+			  != (inode->i_mode & S_IFMT));
+		if (bindex == iinfo->ii_bstart)
+			au_cpup_igen(inode, h_inode);
+		br = au_sbr(sb, bindex);
+		hinode->hi_id = br->br_id;
+		if (au_ftest_hi(flags, XINO)) {
+			err = au_xino_write(sb, bindex, h_inode->i_ino,
+					    inode->i_ino);
+			if (unlikely(err))
+				AuIOErr1("failed au_xino_write() %d\n", err);
+		}
+
+		if (au_ftest_hi(flags, HNOTIFY)
+		    && au_br_hnotifyable(br->br_perm)) {
+			err = au_hn_alloc(hinode, inode);
+			if (unlikely(err))
+				AuIOErr1("au_hn_alloc() %d\n", err);
+		}
+	}
+}
+
+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
+		  struct dentry *h_wh)
+{
+	struct au_hinode *hinode;
+
+	IiMustWriteLock(inode);
+
+	hinode = au_ii(inode)->ii_hinode + bindex;
+	AuDebugOn(hinode->hi_whdentry);
+	hinode->hi_whdentry = h_wh;
+}
+
+void au_update_iigen(struct inode *inode, int half)
+{
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+	unsigned int sigen;
+
+	sigen = au_sigen(inode->i_sb);
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	iigen->ig_generation = sigen;
+	if (half)
+		au_ig_fset(iigen->ig_flags, HALF_REFRESHED);
+	else
+		au_ig_fclr(iigen->ig_flags, HALF_REFRESHED);
+	spin_unlock(&iigen->ig_spin);
+}
+
+/* it may be called at remount time, too */
+void au_update_ibrange(struct inode *inode, int do_put_zero)
+{
+	struct au_iinfo *iinfo;
+	aufs_bindex_t bindex, bend;
+
+	iinfo = au_ii(inode);
+	if (!iinfo)
+		return;
+
+	IiMustWriteLock(inode);
+
+	if (do_put_zero && iinfo->ii_bstart >= 0) {
+		for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
+		     bindex++) {
+			struct inode *h_i;
+
+			h_i = iinfo->ii_hinode[0 + bindex].hi_inode;
+			if (h_i
+			    && !h_i->i_nlink
+			    && !(h_i->i_state & I_LINKABLE))
+				au_set_h_iptr(inode, bindex, NULL, 0);
+		}
+	}
+
+	iinfo->ii_bstart = -1;
+	iinfo->ii_bend = -1;
+	bend = au_sbend(inode->i_sb);
+	for (bindex = 0; bindex <= bend; bindex++)
+		if (iinfo->ii_hinode[0 + bindex].hi_inode) {
+			iinfo->ii_bstart = bindex;
+			break;
+		}
+	if (iinfo->ii_bstart >= 0)
+		for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--)
+			if (iinfo->ii_hinode[0 + bindex].hi_inode) {
+				iinfo->ii_bend = bindex;
+				break;
+			}
+	AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_icntnr_init_once(void *_c)
+{
+	struct au_icntnr *c = _c;
+	struct au_iinfo *iinfo = &c->iinfo;
+	static struct lock_class_key aufs_ii;
+
+	spin_lock_init(&iinfo->ii_generation.ig_spin);
+	au_rw_init(&iinfo->ii_rwsem);
+	au_rw_class(&iinfo->ii_rwsem, &aufs_ii);
+	inode_init_once(&c->vfs_inode);
+}
+
+int au_iinfo_init(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct super_block *sb;
+	int nbr, i;
+
+	sb = inode->i_sb;
+	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
+	nbr = au_sbend(sb) + 1;
+	if (unlikely(nbr <= 0))
+		nbr = 1;
+	iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS);
+	if (iinfo->ii_hinode) {
+		au_ninodes_inc(sb);
+		for (i = 0; i < nbr; i++)
+			iinfo->ii_hinode[i].hi_id = -1;
+
+		iinfo->ii_generation.ig_generation = au_sigen(sb);
+		iinfo->ii_bstart = -1;
+		iinfo->ii_bend = -1;
+		iinfo->ii_vdir = NULL;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+int au_ii_realloc(struct au_iinfo *iinfo, int nbr)
+{
+	int err, sz;
+	struct au_hinode *hip;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	err = -ENOMEM;
+	sz = sizeof(*hip) * (iinfo->ii_bend + 1);
+	if (!sz)
+		sz = sizeof(*hip);
+	hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS);
+	if (hip) {
+		iinfo->ii_hinode = hip;
+		err = 0;
+	}
+
+	return err;
+}
+
+void au_iinfo_fin(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct au_hinode *hi;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bend;
+	const unsigned char unlinked = !inode->i_nlink;
+
+	iinfo = au_ii(inode);
+	/* bad_inode case */
+	if (!iinfo)
+		return;
+
+	sb = inode->i_sb;
+	au_ninodes_dec(sb);
+	if (si_pid_test(sb))
+		au_xino_delete_inode(inode, unlinked);
+	else {
+		/*
+		 * it is safe to hide the dependency between sbinfo and
+		 * sb->s_umount.
+		 */
+		lockdep_off();
+		si_noflush_read_lock(sb);
+		au_xino_delete_inode(inode, unlinked);
+		si_read_unlock(sb);
+		lockdep_on();
+	}
+
+	if (iinfo->ii_vdir)
+		au_vdir_free(iinfo->ii_vdir);
+
+	bindex = iinfo->ii_bstart;
+	if (bindex >= 0) {
+		hi = iinfo->ii_hinode + bindex;
+		bend = iinfo->ii_bend;
+		while (bindex++ <= bend) {
+			if (hi->hi_inode)
+				au_hiput(hi);
+			hi++;
+		}
+	}
+	kfree(iinfo->ii_hinode);
+	iinfo->ii_hinode = NULL;
+	AuRwDestroy(&iinfo->ii_rwsem);
+}
diff --git b/fs/aufs/inode.c b/fs/aufs/inode.c
new file mode 100644
index 0000000..5a87727
--- /dev/null
+++ b/fs/aufs/inode.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode functions
+ */
+
+#include "aufs.h"
+
+struct inode *au_igrab(struct inode *inode)
+{
+	if (inode) {
+		AuDebugOn(!atomic_read(&inode->i_count));
+		ihold(inode);
+	}
+	return inode;
+}
+
+static void au_refresh_hinode_attr(struct inode *inode, int do_version)
+{
+	au_cpup_attr_all(inode, /*force*/0);
+	au_update_iigen(inode, /*half*/1);
+	if (do_version)
+		inode->i_version++;
+}
+
+static int au_ii_refresh(struct inode *inode, int *update)
+{
+	int err, e;
+	umode_t type;
+	aufs_bindex_t bindex, new_bindex;
+	struct super_block *sb;
+	struct au_iinfo *iinfo;
+	struct au_hinode *p, *q, tmp;
+
+	IiMustWriteLock(inode);
+
+	*update = 0;
+	sb = inode->i_sb;
+	type = inode->i_mode & S_IFMT;
+	iinfo = au_ii(inode);
+	err = au_ii_realloc(iinfo, au_sbend(sb) + 1);
+	if (unlikely(err))
+		goto out;
+
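+	/*
+	 * branches may have been added, deleted or reordered; re-map every
+	 * lower inode to its current branch index via its br_id.
+	 */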
+	AuDebugOn(iinfo->ii_bstart < 0);
+	p = iinfo->ii_hinode + iinfo->ii_bstart;
+	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
+	     bindex++, p++) {
+		if (!p->hi_inode)
+			continue;
+
+		AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT));
+		new_bindex = au_br_index(sb, p->hi_id);
+		if (new_bindex == bindex)
+			continue;
+
+		if (new_bindex < 0) {
+			*update = 1;
+			au_hiput(p);
+			p->hi_inode = NULL;
+			continue;
+		}
+
+		if (new_bindex < iinfo->ii_bstart)
+			iinfo->ii_bstart = new_bindex;
+		if (iinfo->ii_bend < new_bindex)
+			iinfo->ii_bend = new_bindex;
+		/* swap the two lower inodes, and loop again */
+		q = iinfo->ii_hinode + new_bindex;
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hi_inode) {
+			bindex--;
+			p--;
+		}
+	}
+	au_update_ibrange(inode, /*do_put_zero*/0);
+	e = au_dy_irefresh(inode);
+	if (unlikely(e && !err))
+		err = e;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+void au_refresh_iop(struct inode *inode, int force_getattr)
+{
+	int type;
+	struct au_sbinfo *sbi = au_sbi(inode->i_sb);
+	const struct inode_operations *iop
+		= force_getattr ? aufs_iop : sbi->si_iop_array;
+
+	if (inode->i_op == iop)
+		return;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		type = AuIop_DIR;
+		break;
+	case S_IFLNK:
+		type = AuIop_SYMLINK;
+		break;
+	default:
+		type = AuIop_OTHER;
+		break;
+	}
+
+	inode->i_op = iop + type;
+	/* unnecessary smp_wmb() */
+}
+
+int au_refresh_hinode_self(struct inode *inode)
+{
+	int err, update;
+
+	err = au_ii_refresh(inode, &update);
+	if (!err)
+		au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode));
+
+	AuTraceErr(err);
+	return err;
+}
+
+int au_refresh_hinode(struct inode *inode, struct dentry *dentry)
+{
+	int err, e, update;
+	unsigned int flags;
+	umode_t mode;
+	aufs_bindex_t bindex, bend;
+	unsigned char isdir;
+	struct au_hinode *p;
+	struct au_iinfo *iinfo;
+
+	err = au_ii_refresh(inode, &update);
+	if (unlikely(err))
+		goto out;
+
+	update = 0;
+	iinfo = au_ii(inode);
+	p = iinfo->ii_hinode + iinfo->ii_bstart;
+	mode = (inode->i_mode & S_IFMT);
+	isdir = S_ISDIR(mode);
+	flags = au_hi_flags(inode, isdir);
+	bend = au_dbend(dentry);
+	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
+		struct inode *h_i, *h_inode;
+		struct dentry *h_d;
+
+		h_d = au_h_dptr(dentry, bindex);
+		if (!h_d || d_is_negative(h_d))
+			continue;
+
+		h_inode = d_inode(h_d);
+		AuDebugOn(mode != (h_inode->i_mode & S_IFMT));
+		if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) {
+			h_i = au_h_iptr(inode, bindex);
+			if (h_i) {
+				if (h_i == h_inode)
+					continue;
+				err = -EIO;
+				break;
+			}
+		}
+		if (bindex < iinfo->ii_bstart)
+			iinfo->ii_bstart = bindex;
+		if (iinfo->ii_bend < bindex)
+			iinfo->ii_bend = bindex;
+		au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags);
+		update = 1;
+	}
+	au_update_ibrange(inode, /*do_put_zero*/0);
+	e = au_dy_irefresh(inode);
+	if (unlikely(e && !err))
+		err = e;
+	if (!err)
+		au_refresh_hinode_attr(inode, update && isdir);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int set_inode(struct inode *inode, struct dentry *dentry)
+{
+	int err;
+	unsigned int flags;
+	umode_t mode;
+	aufs_bindex_t bindex, bstart, btail;
+	unsigned char isdir;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct au_iinfo *iinfo;
+	struct inode_operations *iop;
+
+	IiMustWriteLock(inode);
+
+	err = 0;
+	isdir = 0;
+	iop = au_sbi(inode->i_sb)->si_iop_array;
+	bstart = au_dbstart(dentry);
+	h_dentry = au_h_dptr(dentry, bstart);
+	h_inode = d_inode(h_dentry);
+	mode = h_inode->i_mode;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_OTHER;
+		inode->i_fop = &aufs_file_fop;
+		err = au_dy_iaop(inode, bstart, h_inode);
+		if (unlikely(err))
+			goto out;
+		break;
+	case S_IFDIR:
+		isdir = 1;
+		btail = au_dbtaildir(dentry);
+		inode->i_op = iop + AuIop_DIR;
+		inode->i_fop = &aufs_dir_fop;
+		break;
+	case S_IFLNK:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_SYMLINK;
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+	case S_IFSOCK:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_OTHER;
+		init_special_inode(inode, mode, h_inode->i_rdev);
+		break;
+	default:
+		AuIOErr("Unknown file type 0%o\n", mode);
+		err = -EIO;
+		goto out;
+	}
+
+	/* do not set hnotify for whiteouted dirs (SHWH mode) */
+	flags = au_hi_flags(inode, isdir);
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)
+	    && au_ftest_hi(flags, HNOTIFY)
+	    && dentry->d_name.len > AUFS_WH_PFX_LEN
+	    && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))
+		au_fclr_hi(flags, HNOTIFY);
+	iinfo = au_ii(inode);
+	iinfo->ii_bstart = bstart;
+	iinfo->ii_bend = btail;
+	for (bindex = bstart; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry)
+			au_set_h_iptr(inode, bindex,
+				      au_igrab(d_inode(h_dentry)), flags);
+	}
+	au_cpup_attr_all(inode, /*force*/1);
+	/*
+	 * to force calling aufs_get_acl() every time,
+	 * do not call cache_no_acl() for an aufs inode.
+	 */
+
+out:
+	return err;
+}
+
+/*
+ * a successful return comes with iinfo write-locked.
+ * return value:
+ * minus: errno
+ * zero: success, matched
+ * plus: no error, but unmatched
+ */
+static int reval_inode(struct inode *inode, struct dentry *dentry)
+{
+	int err;
+	unsigned int gen, igflags;
+	aufs_bindex_t bindex, bend;
+	struct inode *h_inode, *h_dinode;
+	struct dentry *h_dentry;
+
+	/*
+	 * before this function, if aufs acquired any iinfo lock, it must be
+	 * only one: the parent dir's.
+	 * this can happen with UDBA and an obsoleted inode number.
+	 */
+	err = -EIO;
+	if (unlikely(inode->i_ino == parent_ino(dentry)))
+		goto out;
+
+	err = 1;
+	ii_write_lock_new_child(inode);
+	h_dentry = au_h_dptr(dentry, au_dbstart(dentry));
+	h_dinode = d_inode(h_dentry);
+	bend = au_ibend(inode);
+	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
+		h_inode = au_h_iptr(inode, bindex);
+		if (!h_inode || h_inode != h_dinode)
+			continue;
+
+		err = 0;
+		gen = au_iigen(inode, &igflags);
+		if (gen == au_digen(dentry)
+		    && !au_ig_ftest(igflags, HALF_REFRESHED))
+			break;
+
+		/* fully refresh inode using dentry */
+		err = au_refresh_hinode(inode, dentry);
+		if (!err)
+			au_update_iigen(inode, /*half*/0);
+		break;
+	}
+
+	if (unlikely(err))
+		ii_write_unlock(inode);
+out:
+	return err;
+}
+
+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+	   unsigned int d_type, ino_t *ino)
+{
+	int err;
+	struct mutex *mtx;
+
+	/* prevent a race condition on the hardlinked inode number */
+	mtx = NULL;
+	if (d_type != DT_DIR) {
+		mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx;
+		mutex_lock(mtx);
+	}
+	err = au_xino_read(sb, bindex, h_ino, ino);
+	if (unlikely(err))
+		goto out;
+
+	if (!*ino) {
+		err = -EIO;
+		*ino = au_xino_new_ino(sb);
+		if (unlikely(!*ino))
+			goto out;
+		err = au_xino_write(sb, bindex, h_ino, *ino);
+		if (unlikely(err))
+			goto out;
+	}
+
+out:
+	if (mtx)
+		mutex_unlock(mtx);
+	return err;
+}
+
+/* a successful return comes with iinfo write-locked */
+/* todo: return with unlocked? */
+struct inode *au_new_inode(struct dentry *dentry, int must_new)
+{
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry;
+	struct super_block *sb;
+	struct mutex *mtx;
+	ino_t h_ino, ino;
+	int err;
+	aufs_bindex_t bstart;
+
+	sb = dentry->d_sb;
+	bstart = au_dbstart(dentry);
+	h_dentry = au_h_dptr(dentry, bstart);
+	h_inode = d_inode(h_dentry);
+	h_ino = h_inode->i_ino;
+
+	/*
+	 * stop 'race'-ing between hardlinks under different
+	 * parents.
+	 */
+	mtx = NULL;
+	if (!d_is_dir(h_dentry))
+		mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx;
+
+new_ino:
+	if (mtx)
+		mutex_lock(mtx);
+	err = au_xino_read(sb, bstart, h_ino, &ino);
+	inode = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
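+	/* ino == 0 means no aufs inode number is assigned to h_ino yet */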
+	if (!ino) {
+		ino = au_xino_new_ino(sb);
+		if (unlikely(!ino)) {
+			inode = ERR_PTR(-EIO);
+			goto out;
+		}
+	}
+
+	AuDbg("i%lu\n", (unsigned long)ino);
+	inode = au_iget_locked(sb, ino);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW));
+	if (inode->i_state & I_NEW) {
+		/* verbose coding for lock class name */
+		if (unlikely(d_is_symlink(h_dentry)))
+			au_rw_class(&au_ii(inode)->ii_rwsem,
+				    au_lc_key + AuLcSymlink_IIINFO);
+		else if (unlikely(d_is_dir(h_dentry)))
+			au_rw_class(&au_ii(inode)->ii_rwsem,
+				    au_lc_key + AuLcDir_IIINFO);
+		else /* likely */
+			au_rw_class(&au_ii(inode)->ii_rwsem,
+				    au_lc_key + AuLcNonDir_IIINFO);
+
+		ii_write_lock_new_child(inode);
+		err = set_inode(inode, dentry);
+		if (!err) {
+			unlock_new_inode(inode);
+			goto out; /* success */
+		}
+
+		/*
+		 * iget_failed() calls iput(), but we need to call
+		 * ii_write_unlock() after iget_failed(). so here is a dirty
+		 * hack for i_count.
+		 */
+		atomic_inc(&inode->i_count);
+		iget_failed(inode);
+		ii_write_unlock(inode);
+		au_xino_write(sb, bstart, h_ino, /*ino*/0);
+		/* ignore this error */
+		goto out_iput;
+	} else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) {
+		/*
+		 * horrible race condition between lookup, readdir and copyup
+		 * (or something).
+		 */
+		if (mtx)
+			mutex_unlock(mtx);
+		err = reval_inode(inode, dentry);
+		if (unlikely(err < 0)) {
+			mtx = NULL;
+			goto out_iput;
+		}
+
+		if (!err) {
+			mtx = NULL;
+			goto out; /* success */
+		} else if (mtx)
+			mutex_lock(mtx);
+	}
+
+	if (unlikely(au_test_fs_unique_ino(h_inode)))
+		AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir,"
+			" b%d, %s, %pd, hi%lu, i%lu.\n",
+			bstart, au_sbtype(h_dentry->d_sb), dentry,
+			(unsigned long)h_ino, (unsigned long)ino);
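+	/* the xino entry is stale; discard it and retry with a fresh number */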
+	ino = 0;
+	err = au_xino_write(sb, bstart, h_ino, /*ino*/0);
+	if (!err) {
+		iput(inode);
+		if (mtx)
+			mutex_unlock(mtx);
+		goto new_ino;
+	}
+
+out_iput:
+	iput(inode);
+	inode = ERR_PTR(err);
+out:
+	if (mtx)
+		mutex_unlock(mtx);
+	return inode;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
+	       struct inode *inode)
+{
+	int err;
+	struct inode *hi;
+
+	err = au_br_rdonly(au_sbr(sb, bindex));
+
+	/* a pseudo-link may fall out of bounds after being flushed */
+	if (!err
+	    && inode
+	    && au_ibstart(inode) <= bindex
+	    && bindex <= au_ibend(inode)) {
+		/*
+		 * permission check is unnecessary since vfsub routine
+		 * will be called later
+		 */
+		hi = au_h_iptr(inode, bindex);
+		if (hi)
+			err = IS_IMMUTABLE(hi) ? -EROFS : 0;
+	}
+
+	return err;
+}
+
+int au_test_h_perm(struct inode *h_inode, int mask)
+{
+	if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
+		return 0;
+	return inode_permission(h_inode, mask);
+}
+
+int au_test_h_perm_sio(struct inode *h_inode, int mask)
+{
+	if (au_test_nfs(h_inode->i_sb)
+	    && (mask & MAY_WRITE)
+	    && S_ISDIR(h_inode->i_mode))
+		mask |= MAY_READ; /* force permission check */
+	return au_test_h_perm(h_inode, mask);
+}
diff --git b/fs/aufs/inode.h b/fs/aufs/inode.h
new file mode 100644
index 0000000..e694be4
--- /dev/null
+++ b/fs/aufs/inode.h
@@ -0,0 +1,673 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations
+ */
+
+#ifndef __AUFS_INODE_H__
+#define __AUFS_INODE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fsnotify.h>
+#include "rwsem.h"
+
+struct vfsmount;
+
+struct au_hnotify {
+#ifdef CONFIG_AUFS_HNOTIFY
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	/* never use fsnotify_add_vfsmount_mark() */
+	struct fsnotify_mark		hn_mark;
+#endif
+	struct inode			*hn_aufs_inode;	/* no get/put */
+#endif
+} ____cacheline_aligned_in_smp;
+
+struct au_hinode {
+	struct inode		*hi_inode;
+	aufs_bindex_t		hi_id;
+#ifdef CONFIG_AUFS_HNOTIFY
+	struct au_hnotify	*hi_notify;
+#endif
+
+	/* reference to the copied-up whiteout with get/put */
+	struct dentry		*hi_whdentry;
+};
+
+/* ig_flags */
+#define AuIG_HALF_REFRESHED		1
+#define au_ig_ftest(flags, name)	((flags) & AuIG_##name)
+#define au_ig_fset(flags, name) \
+	do { (flags) |= AuIG_##name; } while (0)
+#define au_ig_fclr(flags, name) \
+	do { (flags) &= ~AuIG_##name; } while (0)
+
+struct au_iigen {
+	spinlock_t	ig_spin;
+	__u32		ig_generation, ig_flags;
+};
+
+struct au_vdir;
+struct au_iinfo {
+	struct au_iigen		ii_generation;
+	struct super_block	*ii_hsb1;	/* no get/put */
+
+	struct au_rwsem		ii_rwsem;
+	aufs_bindex_t		ii_bstart, ii_bend;
+	__u32			ii_higen;
+	struct au_hinode	*ii_hinode;
+	struct au_vdir		*ii_vdir;
+};
+
+struct au_icntnr {
+	struct au_iinfo iinfo;
+	struct inode vfs_inode;
+	struct hlist_node plink;
+} ____cacheline_aligned_in_smp;
+
+/* au_pin flags */
+#define AuPin_DI_LOCKED		1
+#define AuPin_MNT_WRITE		(1 << 1)
+#define au_ftest_pin(flags, name)	((flags) & AuPin_##name)
+#define au_fset_pin(flags, name) \
+	do { (flags) |= AuPin_##name; } while (0)
+#define au_fclr_pin(flags, name) \
+	do { (flags) &= ~AuPin_##name; } while (0)
+
+struct au_pin {
+	/* input */
+	struct dentry *dentry;
+	unsigned int udba;
+	unsigned char lsc_di, lsc_hi, flags;
+	aufs_bindex_t bindex;
+
+	/* output */
+	struct dentry *parent;
+	struct au_hinode *hdir;
+	struct vfsmount *h_mnt;
+
+	/* temporary unlock/relock for copyup */
+	struct dentry *h_dentry, *h_parent;
+	struct au_branch *br;
+	struct task_struct *task;
+};
+
+void au_pin_hdir_unlock(struct au_pin *p);
+int au_pin_hdir_lock(struct au_pin *p);
+int au_pin_hdir_relock(struct au_pin *p);
+void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task);
+void au_pin_hdir_acquire_nest(struct au_pin *p);
+void au_pin_hdir_release(struct au_pin *p);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_iinfo *au_ii(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+
+	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
+	if (iinfo->ii_hinode)
+		return iinfo;
+	return NULL; /* debugging bad_inode case */
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* inode.c */
+struct inode *au_igrab(struct inode *inode);
+void au_refresh_iop(struct inode *inode, int force_getattr);
+int au_refresh_hinode_self(struct inode *inode);
+int au_refresh_hinode(struct inode *inode, struct dentry *dentry);
+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+	   unsigned int d_type, ino_t *ino);
+struct inode *au_new_inode(struct dentry *dentry, int must_new);
+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
+	       struct inode *inode);
+int au_test_h_perm(struct inode *h_inode, int mask);
+int au_test_h_perm_sio(struct inode *h_inode, int mask);
+
+static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex,
+			    ino_t h_ino, unsigned int d_type, ino_t *ino)
+{
+#ifdef CONFIG_AUFS_SHWH
+	return au_ino(sb, bindex, h_ino, d_type, ino);
+#else
+	return 0;
+#endif
+}
+
+/* i_op.c */
+enum {
+	AuIop_SYMLINK,
+	AuIop_DIR,
+	AuIop_OTHER,
+	AuIop_Last
+};
+extern struct inode_operations aufs_iop[AuIop_Last],
+	aufs_iop_nogetattr[AuIop_Last];
+
+/* au_wr_dir flags */
+#define AuWrDir_ADD_ENTRY	1
+#define AuWrDir_ISDIR		(1 << 1)
+#define AuWrDir_TMPFILE		(1 << 2)
+#define au_ftest_wrdir(flags, name)	((flags) & AuWrDir_##name)
+#define au_fset_wrdir(flags, name) \
+	do { (flags) |= AuWrDir_##name; } while (0)
+#define au_fclr_wrdir(flags, name) \
+	do { (flags) &= ~AuWrDir_##name; } while (0)
+
+struct au_wr_dir_args {
+	aufs_bindex_t force_btgt;
+	unsigned char flags;
+};
+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
+	      struct au_wr_dir_args *args);
+
+struct dentry *au_pinned_h_parent(struct au_pin *pin);
+void au_pin_init(struct au_pin *pin, struct dentry *dentry,
+		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
+		 unsigned int udba, unsigned char flags);
+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
+	   unsigned int udba, unsigned char flags) __must_check;
+int au_do_pin(struct au_pin *pin) __must_check;
+void au_unpin(struct au_pin *pin);
+int au_reval_for_attr(struct dentry *dentry, unsigned int sigen);
+
+#define AuIcpup_DID_CPUP	1
+#define au_ftest_icpup(flags, name)	((flags) & AuIcpup_##name)
+#define au_fset_icpup(flags, name) \
+	do { (flags) |= AuIcpup_##name; } while (0)
+#define au_fclr_icpup(flags, name) \
+	do { (flags) &= ~AuIcpup_##name; } while (0)
+
+struct au_icpup_args {
+	unsigned char flags;
+	unsigned char pin_flags;
+	aufs_bindex_t btgt;
+	unsigned int udba;
+	struct au_pin pin;
+	struct path h_path;
+	struct inode *h_inode;
+};
+
+int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
+		     struct au_icpup_args *a);
+
+int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path);
+
+/* i_op_add.c */
+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir);
+int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+	       dev_t dev);
+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname);
+int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool want_excl);
+struct vfsub_aopen_args;
+int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
+		       struct vfsub_aopen_args *args);
+int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode);
+int aufs_link(struct dentry *src_dentry, struct inode *dir,
+	      struct dentry *dentry);
+int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+
+/* i_op_del.c */
+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup);
+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir);
+int aufs_unlink(struct inode *dir, struct dentry *dentry);
+int aufs_rmdir(struct inode *dir, struct dentry *dentry);
+
+/* i_op_ren.c */
+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt);
+int aufs_rename(struct inode *src_dir, struct dentry *src_dentry,
+		struct inode *dir, struct dentry *dentry);
+
+/* iinfo.c */
+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex);
+void au_hiput(struct au_hinode *hinode);
+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
+		  struct dentry *h_wh);
+unsigned int au_hi_flags(struct inode *inode, int isdir);
+
+/* hinode flags */
+#define AuHi_XINO	1
+#define AuHi_HNOTIFY	(1 << 1)
+#define au_ftest_hi(flags, name)	((flags) & AuHi_##name)
+#define au_fset_hi(flags, name) \
+	do { (flags) |= AuHi_##name; } while (0)
+#define au_fclr_hi(flags, name) \
+	do { (flags) &= ~AuHi_##name; } while (0)
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuHi_HNOTIFY
+#define AuHi_HNOTIFY	0
+#endif
+
+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
+		   struct inode *h_inode, unsigned int flags);
+
+void au_update_iigen(struct inode *inode, int half);
+void au_update_ibrange(struct inode *inode, int do_put_zero);
+
+void au_icntnr_init_once(void *_c);
+int au_iinfo_init(struct inode *inode);
+void au_iinfo_fin(struct inode *inode);
+int au_ii_realloc(struct au_iinfo *iinfo, int nbr);
+
+#ifdef CONFIG_PROC_FS
+/* plink.c */
+int au_plink_maint(struct super_block *sb, int flags);
+struct au_sbinfo;
+void au_plink_maint_leave(struct au_sbinfo *sbinfo);
+int au_plink_maint_enter(struct super_block *sb);
+#ifdef CONFIG_AUFS_DEBUG
+void au_plink_list(struct super_block *sb);
+#else
+AuStubVoid(au_plink_list, struct super_block *sb)
+#endif
+int au_plink_test(struct inode *inode);
+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex);
+void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
+		     struct dentry *h_dentry);
+void au_plink_put(struct super_block *sb, int verbose);
+void au_plink_clean(struct super_block *sb, int verbose);
+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id);
+#else
+AuStubInt0(au_plink_maint, struct super_block *sb, int flags);
+AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo);
+AuStubInt0(au_plink_maint_enter, struct super_block *sb);
+AuStubVoid(au_plink_list, struct super_block *sb);
+AuStubInt0(au_plink_test, struct inode *inode);
+AuStub(struct dentry *, au_plink_lkup, return NULL,
+       struct inode *inode, aufs_bindex_t bindex);
+AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex,
+	   struct dentry *h_dentry);
+AuStubVoid(au_plink_put, struct super_block *sb, int verbose);
+AuStubVoid(au_plink_clean, struct super_block *sb, int verbose);
+AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id);
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_AUFS_XATTR
+/* xattr.c */
+int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
+		  unsigned int verbose);
+ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size);
+ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size);
+int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags);
+int aufs_removexattr(struct dentry *dentry, const char *name);
+
+/* void au_xattr_init(struct super_block *sb); */
+#else
+AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src,
+	   int ignore_flags, unsigned int verbose);
+/* AuStubVoid(au_xattr_init, struct super_block *sb); */
+#endif
+
+#ifdef CONFIG_FS_POSIX_ACL
+struct posix_acl *aufs_get_acl(struct inode *inode, int type);
+int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+#endif
+
+#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
+enum {
+	AU_XATTR_SET,
+	AU_XATTR_REMOVE,
+	AU_ACL_SET
+};
+
+struct au_srxattr {
+	int type;
+	union {
+		struct {
+			const char	*name;
+			const void	*value;
+			size_t		size;
+			int		flags;
+		} set;
+		struct {
+			const char	*name;
+		} remove;
+		struct {
+			struct posix_acl *acl;
+			int		type;
+		} acl_set;
+	} u;
+};
+ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for iinfo */
+enum {
+	AuLsc_II_CHILD,		/* child first */
+	AuLsc_II_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
+	AuLsc_II_CHILD3,	/* copyup dirs */
+	AuLsc_II_PARENT,	/* see AuLsc_I_PARENT in vfsub.h */
+	AuLsc_II_PARENT2,
+	AuLsc_II_PARENT3,	/* copyup dirs */
+	AuLsc_II_NEW_CHILD
+};
+
+/*
+ * ii_read_lock_child, ii_write_lock_child,
+ * ii_read_lock_child2, ii_write_lock_child2,
+ * ii_read_lock_child3, ii_write_lock_child3,
+ * ii_read_lock_parent, ii_write_lock_parent,
+ * ii_read_lock_parent2, ii_write_lock_parent2,
+ * ii_read_lock_parent3, ii_write_lock_parent3,
+ * ii_read_lock_new_child, ii_write_lock_new_child,
+ */
+#define AuReadLockFunc(name, lsc) \
+static inline void ii_read_lock_##name(struct inode *i) \
+{ \
+	au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
+}
+
+#define AuWriteLockFunc(name, lsc) \
+static inline void ii_write_lock_##name(struct inode *i) \
+{ \
+	au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
+}
+
+#define AuRWLockFuncs(name, lsc) \
+	AuReadLockFunc(name, lsc) \
+	AuWriteLockFunc(name, lsc)
+
+AuRWLockFuncs(child, CHILD);
+AuRWLockFuncs(child2, CHILD2);
+AuRWLockFuncs(child3, CHILD3);
+AuRWLockFuncs(parent, PARENT);
+AuRWLockFuncs(parent2, PARENT2);
+AuRWLockFuncs(parent3, PARENT3);
+AuRWLockFuncs(new_child, NEW_CHILD);
+
+#undef AuReadLockFunc
+#undef AuWriteLockFunc
+#undef AuRWLockFuncs
+
+/*
+ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock
+ */
+AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem);
+
+#define IiMustNoWaiters(i)	AuRwMustNoWaiters(&au_ii(i)->ii_rwsem)
+#define IiMustAnyLock(i)	AuRwMustAnyLock(&au_ii(i)->ii_rwsem)
+#define IiMustWriteLock(i)	AuRwMustWriteLock(&au_ii(i)->ii_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+static inline void au_icntnr_init(struct au_icntnr *c)
+{
+#ifdef CONFIG_AUFS_DEBUG
+	c->vfs_inode.i_mode = 0;
+#endif
+}
+
+static inline unsigned int au_iigen(struct inode *inode, unsigned int *igflags)
+{
+	unsigned int gen;
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	if (igflags)
+		*igflags = iigen->ig_flags;
+	gen = iigen->ig_generation;
+	spin_unlock(&iigen->ig_spin);
+
+	return gen;
+}
+
+/* tiny test for inode number */
+/* tmpfs generation is too rough */
+static inline int au_test_higen(struct inode *inode, struct inode *h_inode)
+{
+	struct au_iinfo *iinfo;
+
+	iinfo = au_ii(inode);
+	AuRwMustAnyLock(&iinfo->ii_rwsem);
+	return !(iinfo->ii_hsb1 == h_inode->i_sb
+		 && iinfo->ii_higen == h_inode->i_generation);
+}
+
+static inline void au_iigen_dec(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	iigen->ig_generation--;
+	spin_unlock(&iigen->ig_spin);
+}
+
+static inline int au_iigen_test(struct inode *inode, unsigned int sigen)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(inode && au_iigen(inode, NULL) != sigen))
+		err = -EIO;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline aufs_bindex_t au_ii_br_id(struct inode *inode,
+					aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_hinode[0 + bindex].hi_id;
+}
+
+static inline aufs_bindex_t au_ibstart(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_bstart;
+}
+
+static inline aufs_bindex_t au_ibend(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_bend;
+}
+
+static inline struct au_vdir *au_ivdir(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_vdir;
+}
+
+static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry;
+}
+
+static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_bstart = bindex;
+}
+
+static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_bend = bindex;
+}
+
+static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_vdir = vdir;
+}
+
+static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_hinode + bindex;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct dentry *au_pinned_parent(struct au_pin *pin)
+{
+	if (pin)
+		return pin->parent;
+	return NULL;
+}
+
+static inline struct inode *au_pinned_h_dir(struct au_pin *pin)
+{
+	if (pin && pin->hdir)
+		return pin->hdir->hi_inode;
+	return NULL;
+}
+
+static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin)
+{
+	if (pin)
+		return pin->hdir;
+	return NULL;
+}
+
+static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry)
+{
+	if (pin)
+		pin->dentry = dentry;
+}
+
+static inline void au_pin_set_parent_lflag(struct au_pin *pin,
+					   unsigned char lflag)
+{
+	if (pin) {
+		if (lflag)
+			au_fset_pin(pin->flags, DI_LOCKED);
+		else
+			au_fclr_pin(pin->flags, DI_LOCKED);
+	}
+}
+
+#if 0 /* reserved */
+static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent)
+{
+	if (pin) {
+		dput(pin->parent);
+		pin->parent = dget(parent);
+	}
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+struct au_branch;
+#ifdef CONFIG_AUFS_HNOTIFY
+struct au_hnotify_op {
+	void (*ctl)(struct au_hinode *hinode, int do_set);
+	int (*alloc)(struct au_hinode *hinode);
+
+	/*
+	 * if it returns true, the caller should free hinode->hi_notify,
+	 * otherwise ->free() frees it.
+	 */
+	int (*free)(struct au_hinode *hinode,
+		    struct au_hnotify *hn) __must_check;
+
+	void (*fin)(void);
+	int (*init)(void);
+
+	int (*reset_br)(unsigned int udba, struct au_branch *br, int perm);
+	void (*fin_br)(struct au_branch *br);
+	int (*init_br)(struct au_branch *br, int perm);
+};
+
+/* hnotify.c */
+int au_hn_alloc(struct au_hinode *hinode, struct inode *inode);
+void au_hn_free(struct au_hinode *hinode);
+void au_hn_ctl(struct au_hinode *hinode, int do_set);
+void au_hn_reset(struct inode *inode, unsigned int flags);
+int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
+	       struct qstr *h_child_qstr, struct inode *h_child_inode);
+int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm);
+int au_hnotify_init_br(struct au_branch *br, int perm);
+void au_hnotify_fin_br(struct au_branch *br);
+int __init au_hnotify_init(void);
+void au_hnotify_fin(void);
+
+/* hfsnotify.c */
+extern const struct au_hnotify_op au_hnotify_op;
+
+static inline
+void au_hn_init(struct au_hinode *hinode)
+{
+	hinode->hi_notify = NULL;
+}
+
+static inline struct au_hnotify *au_hn(struct au_hinode *hinode)
+{
+	return hinode->hi_notify;
+}
+
+#else
+AuStub(int, au_hn_alloc, return -EOPNOTSUPP,
+       struct au_hinode *hinode __maybe_unused,
+       struct inode *inode __maybe_unused)
+AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode)
+AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused)
+AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused,
+	   int do_set __maybe_unused)
+AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused,
+	   unsigned int flags __maybe_unused)
+AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused,
+	   struct au_branch *br __maybe_unused,
+	   int perm __maybe_unused)
+AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused,
+	   int perm __maybe_unused)
+AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused)
+AuStubInt0(__init au_hnotify_init, void)
+AuStubVoid(au_hnotify_fin, void)
+AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused)
+#endif /* CONFIG_AUFS_HNOTIFY */
+
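+/*
+ * suspend/resume the hnotify events on the lower dir;
+ * au_hn_imtx_lock/unlock() below keep them suspended while aufs holds the
+ * lower dir's i_mutex.
+ */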
+static inline void au_hn_suspend(struct au_hinode *hdir)
+{
+	au_hn_ctl(hdir, /*do_set*/0);
+}
+
+static inline void au_hn_resume(struct au_hinode *hdir)
+{
+	au_hn_ctl(hdir, /*do_set*/1);
+}
+
+static inline void au_hn_imtx_lock(struct au_hinode *hdir)
+{
+	inode_lock(hdir->hi_inode);
+	au_hn_suspend(hdir);
+}
+
+static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir,
+					  unsigned int sc __maybe_unused)
+{
+	inode_lock_nested(hdir->hi_inode, sc);
+	au_hn_suspend(hdir);
+}
+
+static inline void au_hn_imtx_unlock(struct au_hinode *hdir)
+{
+	au_hn_resume(hdir);
+	inode_unlock(hdir->hi_inode);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_INODE_H__ */
diff --git b/fs/aufs/ioctl.c b/fs/aufs/ioctl.c
new file mode 100644
index 0000000..6528fb9
--- /dev/null
+++ b/fs/aufs/ioctl.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * ioctl
+ * plink-management and readdir in userspace.
+ * assist the pathconf(3) wrapper library.
+ * move-down
+ * File-based Hierarchical Storage Management.
+ */
+
+#include <linux/compat.h>
+#include <linux/file.h>
+#include "aufs.h"
+
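+/*
+ * AUFS_CTL_WBR_FD: pick a writable branch (the one specified by brid, or the
+ * first writable branch found) and return a read-only directory fd for the
+ * top of that branch.
+ */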
+static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg)
+{
+	int err, fd;
+	aufs_bindex_t wbi, bindex, bend;
+	struct file *h_file;
+	struct super_block *sb;
+	struct dentry *root;
+	struct au_branch *br;
+	struct aufs_wbr_fd wbrfd = {
+		.oflags	= au_dir_roflags,
+		.brid	= -1
+	};
+	const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY
+		| O_NOATIME | O_CLOEXEC;
+
+	AuDebugOn(wbrfd.oflags & ~valid);
+
+	if (arg) {
+		err = copy_from_user(&wbrfd, arg, sizeof(wbrfd));
+		if (unlikely(err)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		err = -EINVAL;
+		AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid);
+		wbrfd.oflags |= au_dir_roflags;
+		AuDbg("0%o\n", wbrfd.oflags);
+		if (unlikely(wbrfd.oflags & ~valid))
+			goto out;
+	}
+
+	fd = get_unused_fd_flags(0);
+	err = fd;
+	if (unlikely(fd < 0))
+		goto out;
+
+	h_file = ERR_PTR(-EINVAL);
+	wbi = 0;
+	br = NULL;
+	sb = path->dentry->d_sb;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_IR);
+	bend = au_sbend(sb);
+	if (wbrfd.brid >= 0) {
+		wbi = au_br_index(sb, wbrfd.brid);
+		if (unlikely(wbi < 0 || wbi > bend))
+			goto out_unlock;
+	}
+
+	h_file = ERR_PTR(-ENOENT);
+	br = au_sbr(sb, wbi);
+	if (!au_br_writable(br->br_perm)) {
+		if (arg)
+			goto out_unlock;
+
+		bindex = wbi + 1;
+		wbi = -1;
+		for (; bindex <= bend; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_writable(br->br_perm)) {
+				wbi = bindex;
+				br = au_sbr(sb, wbi);
+				break;
+			}
+		}
+	}
+	AuDbg("wbi %d\n", wbi);
+	if (wbi >= 0)
+		h_file = au_h_open(root, wbi, wbrfd.oflags, NULL,
+				   /*force_wr*/0);
+
+out_unlock:
+	aufs_read_unlock(root, AuLock_IR);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_fd;
+
+	atomic_dec(&br->br_count); /* cf. au_h_open() */
+	fd_install(fd, h_file);
+	err = fd;
+	goto out; /* success */
+
+out_fd:
+	put_unused_fd(fd);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err;
+	struct dentry *dentry;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ioctl(file, cmd, arg);
+		break;
+
+	case AUFS_CTL_WBR_FD:
+		err = au_wbr_fd(&file->f_path, (void __user *)arg);
+		break;
+
+	case AUFS_CTL_IBUSY:
+		err = au_ibusy_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_BRINFO:
+		err = au_brinfo_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_FHSM_FD:
+		dentry = file->f_path.dentry;
+		if (IS_ROOT(dentry))
+			err = au_fhsm_fd(dentry->d_sb, arg);
+		else
+			err = -ENOTTY;
+		break;
+
+	default:
+		/* do not call the lower */
+		AuDbg("0x%x\n", cmd);
+		err = -ENOTTY;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err;
+
+	switch (cmd) {
+	case AUFS_CTL_MVDOWN:
+		err = au_mvdown(file->f_path.dentry, (void __user *)arg);
+		break;
+
+	case AUFS_CTL_WBR_FD:
+		err = au_wbr_fd(&file->f_path, (void __user *)arg);
+		break;
+
+	default:
+		/* do not call the lower */
+		AuDbg("0x%x\n", cmd);
+		err = -ENOTTY;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	long err;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_compat_ioctl(file, cmd, arg);
+		break;
+
+	case AUFS_CTL_IBUSY:
+		err = au_ibusy_compat_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_BRINFO:
+		err = au_brinfo_compat_ioctl(file, arg);
+		break;
+
+	default:
+		err = aufs_ioctl_dir(file, cmd, arg);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git b/fs/aufs/loop.c b/fs/aufs/loop.c
new file mode 100644
index 0000000..5711e7a
--- /dev/null
+++ b/fs/aufs/loop.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * support for loopback block device as a branch
+ */
+
+#include "aufs.h"
+
+/* added into drivers/block/loop.c */
+static struct file *(*backing_file_func)(struct super_block *sb);
+
+/*
+ * test if the branch being added is a loopback mount whose backing file
+ * overlaps with this aufs.
+ */
+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding)
+{
+	struct super_block *h_sb;
+	struct file *backing_file;
+
+	if (unlikely(!backing_file_func)) {
+		/* don't load "loop" module here */
+		backing_file_func = symbol_get(loop_backing_file);
+		if (unlikely(!backing_file_func))
+			/* "loop" module is not loaded */
+			return 0;
+	}
+
+	h_sb = h_adding->d_sb;
+	backing_file = backing_file_func(h_sb);
+	if (!backing_file)
+		return 0;
+
+	h_adding = backing_file->f_path.dentry;
+	/*
+	 * h_adding can be local NFS.
+	 * in this case aufs cannot detect the loop.
+	 */
+	if (unlikely(h_adding->d_sb == sb))
+		return 1;
+	return !!au_test_subdir(h_adding, sb->s_root);
+}
+
+/* true if a kernel thread named 'loop[0-9].*' accesses a file */
+int au_test_loopback_kthread(void)
+{
+	int ret;
+	struct task_struct *tsk = current;
+	char c, comm[sizeof(tsk->comm)];
+
+	ret = 0;
+	if (tsk->flags & PF_KTHREAD) {
+		get_task_comm(comm, tsk);
+		c = comm[4];
+		ret = ('0' <= c && c <= '9'
+		       && !strncmp(comm, "loop", 4));
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define au_warn_loopback_step	16
+static int au_warn_loopback_nelem = au_warn_loopback_step;
+static unsigned long *au_warn_loopback_array;
+
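+/*
+ * warn only once per lower filesystem type; the s_magic values already
+ * reported are remembered in au_warn_loopback_array, which grows on demand.
+ */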
+void au_warn_loopback(struct super_block *h_sb)
+{
+	int i, new_nelem;
+	unsigned long *a, magic;
+	static DEFINE_SPINLOCK(spin);
+
+	magic = h_sb->s_magic;
+	spin_lock(&spin);
+	a = au_warn_loopback_array;
+	for (i = 0; i < au_warn_loopback_nelem && *a; i++)
+		if (a[i] == magic) {
+			spin_unlock(&spin);
+			return;
+		}
+
+	/* h_sb is new to us, print it */
+	if (i < au_warn_loopback_nelem) {
+		a[i] = magic;
+		goto pr;
+	}
+
+	/* expand the array */
+	new_nelem = au_warn_loopback_nelem + au_warn_loopback_step;
+	a = au_kzrealloc(au_warn_loopback_array,
+			 au_warn_loopback_nelem * sizeof(unsigned long),
+			 new_nelem * sizeof(unsigned long), GFP_ATOMIC);
+	if (a) {
+		au_warn_loopback_nelem = new_nelem;
+		au_warn_loopback_array = a;
+		a[i] = magic;
+		goto pr;
+	}
+
+	spin_unlock(&spin);
+	AuWarn1("realloc failed, ignored\n");
+	return;
+
+pr:
+	spin_unlock(&spin);
+	pr_warn("you may want to try another patch for loopback file "
+		"on %s(0x%lx) branch\n", au_sbtype(h_sb), magic);
+}
+
+int au_loopback_init(void)
+{
+	int err;
+	struct super_block *sb __maybe_unused;
+
+	BUILD_BUG_ON(sizeof(sb->s_magic) != sizeof(unsigned long));
+
+	err = 0;
+	au_warn_loopback_array = kcalloc(au_warn_loopback_step,
+					 sizeof(unsigned long), GFP_NOFS);
+	if (unlikely(!au_warn_loopback_array))
+		err = -ENOMEM;
+
+	return err;
+}
+
+void au_loopback_fin(void)
+{
+	if (backing_file_func)
+		symbol_put(loop_backing_file);
+	kfree(au_warn_loopback_array);
+}
diff --git b/fs/aufs/loop.h b/fs/aufs/loop.h
new file mode 100644
index 0000000..48bf070
--- /dev/null
+++ b/fs/aufs/loop.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * support for loopback mount as a branch
+ */
+
+#ifndef __AUFS_LOOP_H__
+#define __AUFS_LOOP_H__
+
+#ifdef __KERNEL__
+
+struct dentry;
+struct super_block;
+
+#ifdef CONFIG_AUFS_BDEV_LOOP
+/* drivers/block/loop.c */
+struct file *loop_backing_file(struct super_block *sb);
+
+/* loop.c */
+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding);
+int au_test_loopback_kthread(void);
+void au_warn_loopback(struct super_block *h_sb);
+
+int au_loopback_init(void);
+void au_loopback_fin(void);
+#else
+AuStubInt0(au_test_loopback_overlap, struct super_block *sb,
+	   struct dentry *h_adding)
+AuStubInt0(au_test_loopback_kthread, void)
+AuStubVoid(au_warn_loopback, struct super_block *h_sb)
+
+AuStubInt0(au_loopback_init, void)
+AuStubVoid(au_loopback_fin, void)
+#endif /* CONFIG_AUFS_BDEV_LOOP */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_LOOP_H__ */
diff --git b/fs/aufs/magic.mk b/fs/aufs/magic.mk
new file mode 100644
index 0000000..4f83bdf
--- /dev/null
+++ b/fs/aufs/magic.mk
@@ -0,0 +1,30 @@
+
+# defined in ${srctree}/fs/fuse/inode.c
+# tristate
+ifdef CONFIG_FUSE_FS
+ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546
+endif
+
+# defined in ${srctree}/fs/xfs/xfs_sb.h
+# tristate
+ifdef CONFIG_XFS_FS
+ccflags-y += -DXFS_SB_MAGIC=0x58465342
+endif
+
+# defined in ${srctree}/fs/configfs/mount.c
+# tristate
+ifdef CONFIG_CONFIGFS_FS
+ccflags-y += -DCONFIGFS_MAGIC=0x62656570
+endif
+
+# defined in ${srctree}/fs/ubifs/ubifs.h
+# tristate
+ifdef CONFIG_UBIFS_FS
+ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905
+endif
+
+# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h
+# tristate
+ifdef CONFIG_HFSPLUS_FS
+ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b
+endif
diff --git b/fs/aufs/module.c b/fs/aufs/module.c
new file mode 100644
index 0000000..317fde0
--- /dev/null
+++ b/fs/aufs/module.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * module global variables and operations
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include "aufs.h"
+
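+/*
+ * like krealloc(), but the region beyond the first @nused bytes is
+ * zero-filled (krealloc() leaves the newly allocated area uninitialized).
+ */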
+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp)
+{
+	if (new_sz <= nused)
+		return p;
+
+	p = krealloc(p, new_sz, gfp);
+	if (p)
+		memset(p + nused, 0, new_sz - nused);
+	return p;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * aufs caches
+ */
+struct kmem_cache *au_cachep[AuCache_Last];
+static int __init au_cache_init(void)
+{
+	au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once);
+	if (au_cachep[AuCache_DINFO])
+		/* SLAB_DESTROY_BY_RCU */
+		au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr,
+							au_icntnr_init_once);
+	if (au_cachep[AuCache_ICNTNR])
+		au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo,
+						       au_fi_init_once);
+	if (au_cachep[AuCache_FINFO])
+		au_cachep[AuCache_VDIR] = AuCache(au_vdir);
+	if (au_cachep[AuCache_VDIR])
+		au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr);
+	if (au_cachep[AuCache_DEHSTR])
+		return 0;
+
+	return -ENOMEM;
+}
+
+static void au_cache_fin(void)
+{
+	int i;
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
+	/* excluding AuCache_HNOTIFY */
+	BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last);
+	for (i = 0; i < AuCache_HNOTIFY; i++) {
+		kmem_cache_destroy(au_cachep[i]);
+		au_cachep[i] = NULL;
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_dir_roflags;
+
+#ifdef CONFIG_AUFS_SBILIST
+/*
+ * iterate_supers_type() doesn't protect us from
+ * remounting (branch management)
+ */
+struct au_sphlhead au_sbilist;
+#endif
+
+struct lock_class_key au_lc_key[AuLcKey_Last];
+
+/*
+ * functions for module interface.
+ */
+MODULE_LICENSE("GPL");
+/* MODULE_LICENSE("GPL v2"); */
+MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>");
+MODULE_DESCRIPTION(AUFS_NAME
+	" -- Advanced multi layered unification filesystem");
+MODULE_VERSION(AUFS_VERSION);
+
+/* this module parameter has no meaning when SYSFS is disabled */
+int sysaufs_brs = 1;
+MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN");
+module_param_named(brs, sysaufs_brs, int, S_IRUGO);
+
+/* this module parameter has no meaning when USER_NS is disabled */
+bool au_userns;
+MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns");
+module_param_named(allow_userns, au_userns, bool, S_IRUGO);
+
+/* ---------------------------------------------------------------------- */
+
+static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */
+
+int au_seq_path(struct seq_file *seq, struct path *path)
+{
+	int err;
+
+	err = seq_path(seq, path, au_esc_chars);
+	if (err > 0)
+		err = 0;
+	else if (err < 0)
+		err = -ENOMEM;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int __init aufs_init(void)
+{
+	int err, i;
+	char *p;
+
+	p = au_esc_chars;
+	for (i = 1; i <= ' '; i++)
+		*p++ = i;
+	*p++ = '\\';
+	*p++ = '\x7f';
+	*p = 0;
+
+	au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE);
+
+	memcpy(aufs_iop_nogetattr, aufs_iop, sizeof(aufs_iop));
+	for (i = 0; i < AuIop_Last; i++)
+		aufs_iop_nogetattr[i].getattr = NULL;
+
+	au_sbilist_init();
+	sysaufs_brs_init();
+	au_debug_init();
+	au_dy_init();
+	err = sysaufs_init();
+	if (unlikely(err))
+		goto out;
+	err = au_procfs_init();
+	if (unlikely(err))
+		goto out_sysaufs;
+	err = au_wkq_init();
+	if (unlikely(err))
+		goto out_procfs;
+	err = au_loopback_init();
+	if (unlikely(err))
+		goto out_wkq;
+	err = au_hnotify_init();
+	if (unlikely(err))
+		goto out_loopback;
+	err = au_sysrq_init();
+	if (unlikely(err))
+		goto out_hin;
+	err = au_cache_init();
+	if (unlikely(err))
+		goto out_sysrq;
+
+	aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0;
+	err = register_filesystem(&aufs_fs_type);
+	if (unlikely(err))
+		goto out_cache;
+
+	/* since we define pr_fmt, call printk directly */
+	printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n");
+	goto out; /* success */
+
+out_cache:
+	au_cache_fin();
+out_sysrq:
+	au_sysrq_fin();
+out_hin:
+	au_hnotify_fin();
+out_loopback:
+	au_loopback_fin();
+out_wkq:
+	au_wkq_fin();
+out_procfs:
+	au_procfs_fin();
+out_sysaufs:
+	sysaufs_fin();
+	au_dy_fin();
+out:
+	return err;
+}
+
+static void __exit aufs_exit(void)
+{
+	unregister_filesystem(&aufs_fs_type);
+	au_cache_fin();
+	au_sysrq_fin();
+	au_hnotify_fin();
+	au_loopback_fin();
+	au_wkq_fin();
+	au_procfs_fin();
+	sysaufs_fin();
+	au_dy_fin();
+}
+
+module_init(aufs_init);
+module_exit(aufs_exit);
diff --git b/fs/aufs/module.h b/fs/aufs/module.h
new file mode 100644
index 0000000..bb86447
--- /dev/null
+++ b/fs/aufs/module.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * module initialization and module-global
+ */
+
+#ifndef __AUFS_MODULE_H__
+#define __AUFS_MODULE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/slab.h>
+
+struct path;
+struct seq_file;
+
+/* module parameters */
+extern int sysaufs_brs;
+extern bool au_userns;
+
+/* ---------------------------------------------------------------------- */
+
+extern int au_dir_roflags;
+
+enum {
+	AuLcNonDir_FIINFO,
+	AuLcNonDir_DIINFO,
+	AuLcNonDir_IIINFO,
+
+	AuLcDir_FIINFO,
+	AuLcDir_DIINFO,
+	AuLcDir_IIINFO,
+
+	AuLcSymlink_DIINFO,
+	AuLcSymlink_IIINFO,
+
+	AuLcKey_Last
+};
+extern struct lock_class_key au_lc_key[AuLcKey_Last];
+
+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp);
+int au_seq_path(struct seq_file *seq, struct path *path);
+
+#ifdef CONFIG_PROC_FS
+/* procfs.c */
+int __init au_procfs_init(void);
+void au_procfs_fin(void);
+#else
+AuStubInt0(au_procfs_init, void);
+AuStubVoid(au_procfs_fin, void);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/* kmem cache */
+enum {
+	AuCache_DINFO,
+	AuCache_ICNTNR,
+	AuCache_FINFO,
+	AuCache_VDIR,
+	AuCache_DEHSTR,
+	AuCache_HNOTIFY, /* must be last */
+	AuCache_Last
+};
+
+#define AuCacheFlags		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD)
+#define AuCache(type)		KMEM_CACHE(type, AuCacheFlags)
+#define AuCacheCtor(type, ctor)	\
+	kmem_cache_create(#type, sizeof(struct type), \
+			  __alignof__(struct type), AuCacheFlags, ctor)
+
+extern struct kmem_cache *au_cachep[];
+
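+/*
+ * au_cache_alloc_dinfo, au_cache_free_dinfo,
+ * au_cache_alloc_icntnr, au_cache_free_icntnr,
+ * au_cache_alloc_finfo, au_cache_free_finfo,
+ * au_cache_alloc_vdir, au_cache_free_vdir,
+ * au_cache_alloc_vdir_dehstr, au_cache_free_vdir_dehstr,
+ * au_cache_alloc_hnotify, au_cache_free_hnotify
+ */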
+#define AuCacheFuncs(name, index) \
+static inline struct au_##name *au_cache_alloc_##name(void) \
+{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \
+static inline void au_cache_free_##name(struct au_##name *p) \
+{ kmem_cache_free(au_cachep[AuCache_##index], p); }
+
+AuCacheFuncs(dinfo, DINFO);
+AuCacheFuncs(icntnr, ICNTNR);
+AuCacheFuncs(finfo, FINFO);
+AuCacheFuncs(vdir, VDIR);
+AuCacheFuncs(vdir_dehstr, DEHSTR);
+#ifdef CONFIG_AUFS_HNOTIFY
+AuCacheFuncs(hnotify, HNOTIFY);
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_MODULE_H__ */
diff --git b/fs/aufs/mvdown.c b/fs/aufs/mvdown.c
new file mode 100644
index 0000000..d841542
--- /dev/null
+++ b/fs/aufs/mvdown.c
@@ -0,0 +1,690 @@
+/*
+ * Copyright (C) 2011-2016 Junjiro R. Okajima
+ */
+
+/*
+ * move-down, opposite of copy-up
+ */
+
+#include "aufs.h"
+
+struct au_mvd_args {
+	struct {
+		struct super_block *h_sb;
+		struct dentry *h_parent;
+		struct au_hinode *hdir;
+		struct inode *h_dir, *h_inode;
+		struct au_pin pin;
+	} info[AUFS_MVDOWN_NARRAY];
+
+	struct aufs_mvdown mvdown;
+	struct dentry *dentry, *parent;
+	struct inode *inode, *dir;
+	struct super_block *sb;
+	aufs_bindex_t bopq, bwh, bfound;
+	unsigned char rename_lock;
+};
+
+#define mvd_errno		mvdown.au_errno
+#define mvd_bsrc		mvdown.stbr[AUFS_MVDOWN_UPPER].bindex
+#define mvd_src_brid		mvdown.stbr[AUFS_MVDOWN_UPPER].brid
+#define mvd_bdst		mvdown.stbr[AUFS_MVDOWN_LOWER].bindex
+#define mvd_dst_brid		mvdown.stbr[AUFS_MVDOWN_LOWER].brid
+
+#define mvd_h_src_sb		info[AUFS_MVDOWN_UPPER].h_sb
+#define mvd_h_src_parent	info[AUFS_MVDOWN_UPPER].h_parent
+#define mvd_hdir_src		info[AUFS_MVDOWN_UPPER].hdir
+#define mvd_h_src_dir		info[AUFS_MVDOWN_UPPER].h_dir
+#define mvd_h_src_inode		info[AUFS_MVDOWN_UPPER].h_inode
+#define mvd_pin_src		info[AUFS_MVDOWN_UPPER].pin
+
+#define mvd_h_dst_sb		info[AUFS_MVDOWN_LOWER].h_sb
+#define mvd_h_dst_parent	info[AUFS_MVDOWN_LOWER].h_parent
+#define mvd_hdir_dst		info[AUFS_MVDOWN_LOWER].hdir
+#define mvd_h_dst_dir		info[AUFS_MVDOWN_LOWER].h_dir
+#define mvd_h_dst_inode		info[AUFS_MVDOWN_LOWER].h_inode
+#define mvd_pin_dst		info[AUFS_MVDOWN_LOWER].pin
+
+#define AU_MVD_PR(flag, ...) do {			\
+		if (flag)				\
+			pr_err(__VA_ARGS__);		\
+	} while (0)
+
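+/*
+ * find the branch below bsrc to move the file down to: with FHSM_LOWER, an
+ * FHSM branch on a non-readonly fs; without ROLOWER, a writable branch;
+ * otherwise any branch whose fs is not mounted read-only (setting ROLOWER_R
+ * when that branch is aufs-readonly).
+ */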
+static int find_lower_writable(struct au_mvd_args *a)
+{
+	struct super_block *sb;
+	aufs_bindex_t bindex, bend;
+	struct au_branch *br;
+
+	sb = a->sb;
+	bindex = a->mvd_bsrc;
+	bend = au_sbend(sb);
+	if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER)
+		for (bindex++; bindex <= bend; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm)
+			    && (!(au_br_sb(br)->s_flags & MS_RDONLY)))
+				return bindex;
+		}
+	else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER))
+		for (bindex++; bindex <= bend; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (!au_br_rdonly(br))
+				return bindex;
+		}
+	else
+		for (bindex++; bindex <= bend; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (!(au_br_sb(br)->s_flags & MS_RDONLY)) {
+				if (au_br_rdonly(br))
+					a->mvdown.flags
+						|= AUFS_MVDOWN_ROLOWER_R;
+				return bindex;
+			}
+		}
+
+	return -1;
+}
+
+/* make the parent dir on bdst */
+static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = 0;
+	a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc);
+	a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst);
+	a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc);
+	a->mvd_h_dst_parent = NULL;
+	if (au_dbend(a->parent) >= a->mvd_bdst)
+		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
+	if (!a->mvd_h_dst_parent) {
+		err = au_cpdown_dirs(a->dentry, a->mvd_bdst);
+		if (unlikely(err)) {
+			AU_MVD_PR(dmsg, "cpdown_dirs failed\n");
+			goto out;
+		}
+		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* lock them all */
+static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct dentry *h_trap;
+
+	a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc);
+	a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst);
+	err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst,
+		     au_opt_udba(a->sb),
+		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+	AuTraceErr(err);
+	if (unlikely(err)) {
+		AU_MVD_PR(dmsg, "pin_dst failed\n");
+		goto out;
+	}
+
+	if (a->mvd_h_src_sb != a->mvd_h_dst_sb) {
+		a->rename_lock = 0;
+		au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
+			    AuLsc_DI_PARENT, AuLsc_I_PARENT3,
+			    au_opt_udba(a->sb),
+			    AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+		err = au_do_pin(&a->mvd_pin_src);
+		AuTraceErr(err);
+		a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
+		if (unlikely(err)) {
+			AU_MVD_PR(dmsg, "pin_src failed\n");
+			goto out_dst;
+		}
+		goto out; /* success */
+	}
+
+	a->rename_lock = 1;
+	au_pin_hdir_unlock(&a->mvd_pin_dst);
+	err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
+		     au_opt_udba(a->sb),
+		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+	AuTraceErr(err);
+	a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
+	if (unlikely(err)) {
+		AU_MVD_PR(dmsg, "pin_src failed\n");
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+		goto out_dst;
+	}
+	au_pin_hdir_unlock(&a->mvd_pin_src);
+	h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				   a->mvd_h_dst_parent, a->mvd_hdir_dst);
+	if (h_trap) {
+		err = (h_trap != a->mvd_h_src_parent);
+		if (err)
+			err = (h_trap != a->mvd_h_dst_parent);
+	}
+	BUG_ON(err); /* it should never happen */
+	if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) {
+		err = -EBUSY;
+		AuTraceErr(err);
+		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
+		au_pin_hdir_lock(&a->mvd_pin_src);
+		au_unpin(&a->mvd_pin_src);
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+		goto out_dst;
+	}
+	goto out; /* success */
+
+out_dst:
+	au_unpin(&a->mvd_pin_dst);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	if (!a->rename_lock)
+		au_unpin(&a->mvd_pin_src);
+	else {
+		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
+		au_pin_hdir_lock(&a->mvd_pin_src);
+		au_unpin(&a->mvd_pin_src);
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+	}
+	au_unpin(&a->mvd_pin_dst);
+}
+
+/* copy-down the file */
+static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_cp_generic cpg = {
+		.dentry	= a->dentry,
+		.bdst	= a->mvd_bdst,
+		.bsrc	= a->mvd_bsrc,
+		.len	= -1,
+		.pin	= &a->mvd_pin_dst,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN
+	};
+
+	AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst);
+	if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
+		au_fset_cpup(cpg.flags, OVERWRITE);
+	if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER)
+		au_fset_cpup(cpg.flags, RWDST);
+	err = au_sio_cpdown_simple(&cpg);
+	if (unlikely(err))
+		AU_MVD_PR(dmsg, "cpdown failed\n");
+
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * unlink the whiteout on bdst if it exists, which may have been created by
+ * UDBA while we were sleeping
+ */
+static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct path h_path;
+	struct au_branch *br;
+	struct inode *delegated;
+
+	br = au_sbr(a->sb, a->mvd_bdst);
+	h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry)) {
+		AU_MVD_PR(dmsg, "wh_lkup failed\n");
+		goto out;
+	}
+
+	err = 0;
+	if (d_is_positive(h_path.dentry)) {
+		h_path.mnt = au_br_mnt(br);
+		delegated = NULL;
+		err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path,
+				   &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+		if (unlikely(err))
+			AU_MVD_PR(dmsg, "wh_unlink failed\n");
+	}
+	dput(h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * unlink the topmost h_dentry
+ */
+static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct path h_path;
+	struct inode *delegated;
+
+	h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc);
+	h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc);
+	delegated = NULL;
+	err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err))
+		AU_MVD_PR(dmsg, "unlink failed\n");
+
+	AuTraceErr(err);
+	return err;
+}
+
+/* since mvdown itself succeeded, any error from this function is ignored */
+static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_branch *br;
+
+	a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED;
+	br = au_sbr(a->sb, a->mvd_bsrc);
+	err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs);
+	if (!err) {
+		br = au_sbr(a->sb, a->mvd_bdst);
+		a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id;
+		err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs);
+	}
+	if (!err)
+		a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED;
+	else
+		AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err);
+}
+
+/*
+ * copy-down the file and unlink the bsrc file.
+ * - unlink the bdst whiteout if it exists
+ * - copy-down the file (with whtmp name and rename)
+ * - unlink the bsrc file
+ */
+static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = au_do_mkdir(dmsg, a);
+	if (!err)
+		err = au_do_lock(dmsg, a);
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * do not revert the activities we made on bdst since they should be
+	 * harmless in aufs.
+	 */
+
+	err = au_do_cpdown(dmsg, a);
+	if (!err)
+		err = au_do_unlink_wh(dmsg, a);
+	if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER))
+		err = au_do_unlink(dmsg, a);
+	if (unlikely(err))
+		goto out_unlock;
+
+	AuDbg("%pd2, 0x%x, %d --> %d\n",
+	      a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst);
+	if (find_lower_writable(a) < 0)
+		a->mvdown.flags |= AUFS_MVDOWN_BOTTOM;
+
+	if (a->mvdown.flags & AUFS_MVDOWN_STFS)
+		au_do_stfs(dmsg, a);
+
+	/* maintain internal array */
+	if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) {
+		au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL);
+		au_set_dbstart(a->dentry, a->mvd_bdst);
+		au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0);
+		au_set_ibstart(a->inode, a->mvd_bdst);
+	} else {
+		/* hide the lower */
+		au_set_h_dptr(a->dentry, a->mvd_bdst, NULL);
+		au_set_dbend(a->dentry, a->mvd_bsrc);
+		au_set_h_iptr(a->inode, a->mvd_bdst, NULL, /*flags*/0);
+		au_set_ibend(a->inode, a->mvd_bsrc);
+	}
+	if (au_dbend(a->dentry) < a->mvd_bdst)
+		au_set_dbend(a->dentry, a->mvd_bdst);
+	if (au_ibend(a->inode) < a->mvd_bdst)
+		au_set_ibend(a->inode, a->mvd_bdst);
+
+out_unlock:
+	au_do_unlock(dmsg, a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* make sure the file is idle */
+static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err, plinked;
+
+	err = 0;
+	plinked = !!au_opt_test(au_mntflags(a->sb), PLINK);
+	if (au_dbstart(a->dentry) == a->mvd_bsrc
+	    && au_dcount(a->dentry) == 1
+	    && atomic_read(&a->inode->i_count) == 1
+	    /* && a->mvd_h_src_inode->i_nlink == 1 */
+	    && (!plinked || !au_plink_test(a->inode))
+	    && a->inode->i_nlink == 1)
+		goto out;
+
+	err = -EBUSY;
+	AU_MVD_PR(dmsg,
+		  "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n",
+		  a->mvd_bsrc, au_dbstart(a->dentry), au_dcount(a->dentry),
+		  atomic_read(&a->inode->i_count), a->inode->i_nlink,
+		  a->mvd_h_src_inode->i_nlink,
+		  plinked, plinked ? au_plink_test(a->inode) : 0);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* make sure the parent dir is fine */
+static int au_mvd_args_parent(const unsigned char dmsg,
+			      struct au_mvd_args *a)
+{
+	int err;
+	aufs_bindex_t bindex;
+
+	err = 0;
+	if (unlikely(au_alive_dir(a->parent))) {
+		err = -ENOENT;
+		AU_MVD_PR(dmsg, "parent dir is dead\n");
+		goto out;
+	}
+
+	a->bopq = au_dbdiropq(a->parent);
+	bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst);
+	AuDbg("b%d\n", bindex);
+	if (unlikely((bindex >= 0 && bindex < a->mvd_bdst)
+		     || (a->bopq != -1 && a->bopq < a->mvd_bdst))) {
+		err = -EINVAL;
+		a->mvd_errno = EAU_MVDOWN_OPAQUE;
+		AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n",
+			  a->bopq, a->mvd_bdst);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args_intermediate(const unsigned char dmsg,
+				    struct au_mvd_args *a)
+{
+	int err;
+	struct au_dinfo *dinfo, *tmp;
+
+	/* lookup the next lower positive entry */
+	err = -ENOMEM;
+	tmp = au_di_alloc(a->sb, AuLsc_DI_TMP);
+	if (unlikely(!tmp))
+		goto out;
+
+	a->bfound = -1;
+	a->bwh = -1;
+	dinfo = au_di(a->dentry);
+	au_di_cp(tmp, dinfo);
+	au_di_swap(tmp, dinfo);
+
+	/* returns the number of positive dentries */
+	err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0);
+	if (!err)
+		a->bwh = au_dbwh(a->dentry);
+	else if (err > 0)
+		a->bfound = au_dbstart(a->dentry);
+
+	au_di_swap(tmp, dinfo);
+	au_rw_write_unlock(&tmp->di_rwsem);
+	au_di_free(tmp);
+	if (unlikely(err < 0))
+		AU_MVD_PR(dmsg, "failed look-up lower\n");
+
+	/*
+	 * here, we have these cases.
+	 * bfound == -1
+	 *	no positive dentry under bsrc. there are more sub-cases.
+	 *	bwh < 0
+	 *		there is no whiteout; we can safely move-down.
+	 *	bwh <= bsrc
+	 *		impossible
+	 *	bsrc < bwh && bwh < bdst
+	 *		there is a whiteout on a RO branch. cannot proceed.
+	 *	bwh == bdst
+	 *		there is a whiteout on the RW target branch. it should
+	 *		be removed.
+	 *	bdst < bwh
+	 *		there is a whiteout on some unrelated branch.
+	 * -1 < bfound && bfound <= bsrc
+	 *	impossible.
+	 * bfound < bdst
+	 *	found, but it is on a RO branch between bsrc and bdst. cannot
+	 *	proceed.
+	 * bfound == bdst
+	 *	found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return
+	 *	error.
+	 * bdst < bfound
+	 *	found, after we create the file on bdst, it will be hidden.
+	 */
+
+	AuDebugOn(a->bfound == -1
+		  && a->bwh != -1
+		  && a->bwh <= a->mvd_bsrc);
+	AuDebugOn(-1 < a->bfound
+		  && a->bfound <= a->mvd_bsrc);
+
+	err = -EINVAL;
+	if (a->bfound == -1
+	    && a->mvd_bsrc < a->bwh
+	    && a->bwh != -1
+	    && a->bwh < a->mvd_bdst) {
+		a->mvd_errno = EAU_MVDOWN_WHITEOUT;
+		AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n",
+			  a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh);
+		goto out;
+	} else if (a->bfound != -1 && a->bfound < a->mvd_bdst) {
+		a->mvd_errno = EAU_MVDOWN_UPPER;
+		AU_MVD_PR(dmsg, "bdst %d, bfound %d\n",
+			  a->mvd_bdst, a->bfound);
+		goto out;
+	}
+
+	err = 0; /* success */
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = 0;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
+	    && a->bfound == a->mvd_bdst)
+		err = -EEXIST;
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_branch *br;
+
+	err = -EISDIR;
+	if (unlikely(S_ISDIR(a->inode->i_mode)))
+		goto out;
+
+	err = -EINVAL;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER))
+		a->mvd_bsrc = au_ibstart(a->inode);
+	else {
+		a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid);
+		if (unlikely(a->mvd_bsrc < 0
+			     || (a->mvd_bsrc < au_dbstart(a->dentry)
+				 || au_dbend(a->dentry) < a->mvd_bsrc
+				 || !au_h_dptr(a->dentry, a->mvd_bsrc))
+			     || (a->mvd_bsrc < au_ibstart(a->inode)
+				 || au_ibend(a->inode) < a->mvd_bsrc
+				 || !au_h_iptr(a->inode, a->mvd_bsrc)))) {
+			a->mvd_errno = EAU_MVDOWN_NOUPPER;
+			AU_MVD_PR(dmsg, "no upper\n");
+			goto out;
+		}
+	}
+	if (unlikely(a->mvd_bsrc == au_sbend(a->sb))) {
+		a->mvd_errno = EAU_MVDOWN_BOTTOM;
+		AU_MVD_PR(dmsg, "on the bottom\n");
+		goto out;
+	}
+	a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc);
+	br = au_sbr(a->sb, a->mvd_bsrc);
+	err = au_br_rdonly(br);
+	if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) {
+		if (unlikely(err))
+			goto out;
+	} else if (!(vfsub_native_ro(a->mvd_h_src_inode)
+		     || IS_APPEND(a->mvd_h_src_inode))) {
+		if (err)
+			a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R;
+		/* go on */
+	} else
+		goto out;
+
+	err = -EINVAL;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) {
+		a->mvd_bdst = find_lower_writable(a);
+		if (unlikely(a->mvd_bdst < 0)) {
+			a->mvd_errno = EAU_MVDOWN_BOTTOM;
+			AU_MVD_PR(dmsg, "no writable lower branch\n");
+			goto out;
+		}
+	} else {
+		a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid);
+		if (unlikely(a->mvd_bdst < 0
+			     || au_sbend(a->sb) < a->mvd_bdst)) {
+			a->mvd_errno = EAU_MVDOWN_NOLOWERBR;
+			AU_MVD_PR(dmsg, "no lower brid\n");
+			goto out;
+		}
+	}
+
+	err = au_mvd_args_busy(dmsg, a);
+	if (!err)
+		err = au_mvd_args_parent(dmsg, a);
+	if (!err)
+		err = au_mvd_args_intermediate(dmsg, a);
+	if (!err)
+		err = au_mvd_args_exist(dmsg, a);
+	if (!err)
+		AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg)
+{
+	int err, e;
+	unsigned char dmsg;
+	struct au_mvd_args *args;
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -ENOMEM;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (unlikely(!args))
+		goto out;
+
+	err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown));
+	if (!err)
+		err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out_free;
+	}
+	AuDbg("flags 0x%x\n", args->mvdown.flags);
+	args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R);
+	args->mvdown.au_errno = 0;
+	args->dentry = dentry;
+	args->inode = inode;
+	args->sb = dentry->d_sb;
+
+	err = -ENOENT;
+	dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG);
+	args->parent = dget_parent(dentry);
+	args->dir = d_inode(args->parent);
+	inode_lock_nested(args->dir, I_MUTEX_PARENT);
+	dput(args->parent);
+	if (unlikely(args->parent != dentry->d_parent)) {
+		AU_MVD_PR(dmsg, "parent dir is moved\n");
+		goto out_dir;
+	}
+
+	inode_lock_nested(inode, I_MUTEX_CHILD);
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_NOPLMW);
+	if (unlikely(err))
+		goto out_inode;
+
+	di_write_lock_parent(args->parent);
+	err = au_mvd_args(dmsg, args);
+	if (unlikely(err))
+		goto out_parent;
+
+	err = au_do_mvdown(dmsg, args);
+	if (unlikely(err))
+		goto out_parent;
+
+	au_cpup_attr_timesizes(args->dir);
+	au_cpup_attr_timesizes(inode);
+	if (!(args->mvdown.flags & AUFS_MVDOWN_KUPPER))
+		au_cpup_igen(inode, au_h_iptr(inode, args->mvd_bdst));
+	/* au_digen_dec(dentry); */
+
+out_parent:
+	di_write_unlock(args->parent);
+	aufs_read_unlock(dentry, AuLock_DW);
+out_inode:
+	inode_unlock(inode);
+out_dir:
+	inode_unlock(args->dir);
+out_free:
+	e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown));
+	if (unlikely(e))
+		err = -EFAULT;
+	kfree(args);
+out:
+	AuTraceErr(err);
+	return err;
+}
diff --git b/fs/aufs/opts.c b/fs/aufs/opts.c
new file mode 100644
index 0000000..fc90b65
--- /dev/null
+++ b/fs/aufs/opts.c
@@ -0,0 +1,1846 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount options/flags
+ */
+
+#include <linux/namei.h>
+#include <linux/types.h> /* some distributions require this */
+#include <linux/parser.h>
+#include "aufs.h"
+
+/* ---------------------------------------------------------------------- */
+
+enum {
+	Opt_br,
+	Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend,
+	Opt_idel, Opt_imod,
+	Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash,
+	Opt_rdblk_def, Opt_rdhash_def,
+	Opt_xino, Opt_noxino,
+	Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino,
+	Opt_trunc_xino_path, Opt_itrunc_xino,
+	Opt_trunc_xib, Opt_notrunc_xib,
+	Opt_shwh, Opt_noshwh,
+	Opt_plink, Opt_noplink, Opt_list_plink,
+	Opt_udba,
+	Opt_dio, Opt_nodio,
+	Opt_diropq_a, Opt_diropq_w,
+	Opt_warn_perm, Opt_nowarn_perm,
+	Opt_wbr_copyup, Opt_wbr_create,
+	Opt_fhsm_sec,
+	Opt_verbose, Opt_noverbose,
+	Opt_sum, Opt_nosum, Opt_wsum,
+	Opt_dirperm1, Opt_nodirperm1,
+	Opt_acl, Opt_noacl,
+	Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err
+};
+
+static match_table_t options = {
+	{Opt_br, "br=%s"},
+	{Opt_br, "br:%s"},
+
+	{Opt_add, "add=%d:%s"},
+	{Opt_add, "add:%d:%s"},
+	{Opt_add, "ins=%d:%s"},
+	{Opt_add, "ins:%d:%s"},
+	{Opt_append, "append=%s"},
+	{Opt_append, "append:%s"},
+	{Opt_prepend, "prepend=%s"},
+	{Opt_prepend, "prepend:%s"},
+
+	{Opt_del, "del=%s"},
+	{Opt_del, "del:%s"},
+	/* {Opt_idel, "idel:%d"}, */
+	{Opt_mod, "mod=%s"},
+	{Opt_mod, "mod:%s"},
+	/* {Opt_imod, "imod:%d:%s"}, */
+
+	{Opt_dirwh, "dirwh=%d"},
+
+	{Opt_xino, "xino=%s"},
+	{Opt_noxino, "noxino"},
+	{Opt_trunc_xino, "trunc_xino"},
+	{Opt_trunc_xino_v, "trunc_xino_v=%d:%d"},
+	{Opt_notrunc_xino, "notrunc_xino"},
+	{Opt_trunc_xino_path, "trunc_xino=%s"},
+	{Opt_itrunc_xino, "itrunc_xino=%d"},
+	/* {Opt_zxino, "zxino=%s"}, */
+	{Opt_trunc_xib, "trunc_xib"},
+	{Opt_notrunc_xib, "notrunc_xib"},
+
+#ifdef CONFIG_PROC_FS
+	{Opt_plink, "plink"},
+#else
+	{Opt_ignore_silent, "plink"},
+#endif
+
+	{Opt_noplink, "noplink"},
+
+#ifdef CONFIG_AUFS_DEBUG
+	{Opt_list_plink, "list_plink"},
+#endif
+
+	{Opt_udba, "udba=%s"},
+
+	{Opt_dio, "dio"},
+	{Opt_nodio, "nodio"},
+
+#ifdef CONFIG_AUFS_FHSM
+	{Opt_fhsm_sec, "fhsm_sec=%d"},
+#else
+	{Opt_ignore_silent, "fhsm_sec=%d"},
+#endif
+
+	{Opt_diropq_a, "diropq=always"},
+	{Opt_diropq_a, "diropq=a"},
+	{Opt_diropq_w, "diropq=whiteouted"},
+	{Opt_diropq_w, "diropq=w"},
+
+	{Opt_warn_perm, "warn_perm"},
+	{Opt_nowarn_perm, "nowarn_perm"},
+
+	/* keep them temporarily */
+	{Opt_ignore_silent, "nodlgt"},
+	{Opt_ignore_silent, "clean_plink"},
+
+#ifdef CONFIG_AUFS_SHWH
+	{Opt_shwh, "shwh"},
+#endif
+	{Opt_noshwh, "noshwh"},
+
+	{Opt_dirperm1, "dirperm1"},
+	{Opt_nodirperm1, "nodirperm1"},
+
+	{Opt_verbose, "verbose"},
+	{Opt_verbose, "v"},
+	{Opt_noverbose, "noverbose"},
+	{Opt_noverbose, "quiet"},
+	{Opt_noverbose, "q"},
+	{Opt_noverbose, "silent"},
+
+	{Opt_sum, "sum"},
+	{Opt_nosum, "nosum"},
+	{Opt_wsum, "wsum"},
+
+	{Opt_rdcache, "rdcache=%d"},
+	{Opt_rdblk, "rdblk=%d"},
+	{Opt_rdblk_def, "rdblk=def"},
+	{Opt_rdhash, "rdhash=%d"},
+	{Opt_rdhash_def, "rdhash=def"},
+
+	{Opt_wbr_create, "create=%s"},
+	{Opt_wbr_create, "create_policy=%s"},
+	{Opt_wbr_copyup, "cpup=%s"},
+	{Opt_wbr_copyup, "copyup=%s"},
+	{Opt_wbr_copyup, "copyup_policy=%s"},
+
+	/* generic VFS flag */
+#ifdef CONFIG_FS_POSIX_ACL
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+#else
+	{Opt_ignore_silent, "acl"},
+	{Opt_ignore_silent, "noacl"},
+#endif
+
+	/* internal use for the scripts */
+	{Opt_ignore_silent, "si=%s"},
+
+	{Opt_br, "dirs=%s"},
+	{Opt_ignore, "debug=%d"},
+	{Opt_ignore, "delete=whiteout"},
+	{Opt_ignore, "delete=all"},
+	{Opt_ignore, "imap=%s"},
+
+	/* temporary workaround, due to old mount(8)? */
+	{Opt_ignore_silent, "relatime"},
+
+	{Opt_err, NULL}
+};
+
+/* ---------------------------------------------------------------------- */
+
+static const char *au_parser_pattern(int val, match_table_t tbl)
+{
+	struct match_token *p;
+
+	p = tbl;
+	while (p->pattern) {
+		if (p->token == val)
+			return p->pattern;
+		p++;
+	}
+	BUG();
+	return "??";
+}
+
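+/*
+ * pick one flag set in *val, clear it there, and return its pattern string.
+ * returns NULL when no (more) flag in *val matches the table.
+ */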
+static const char *au_optstr(int *val, match_table_t tbl)
+{
+	struct match_token *p;
+	int v;
+
+	v = *val;
+	if (!v)
+		goto out;
+	p = tbl;
+	while (p->pattern) {
+		if (p->token
+		    && (v & p->token) == p->token) {
+			*val &= ~p->token;
+			return p->pattern;
+		}
+		p++;
+	}
+
+out:
+	return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t brperm = {
+	{AuBrPerm_RO, AUFS_BRPERM_RO},
+	{AuBrPerm_RR, AUFS_BRPERM_RR},
+	{AuBrPerm_RW, AUFS_BRPERM_RW},
+	{0, NULL}
+};
+
+static match_table_t brattr = {
+	/* general */
+	{AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG},
+	{AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL},
+	/* 'unpin' attrib is meaningless since linux-3.18-rc1 */
+	{AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN},
+#ifdef CONFIG_AUFS_FHSM
+	{AuBrAttr_FHSM, AUFS_BRATTR_FHSM},
+#endif
+#ifdef CONFIG_AUFS_XATTR
+	{AuBrAttr_ICEX, AUFS_BRATTR_ICEX},
+	{AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC},
+	{AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS},
+	{AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR},
+	{AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR},
+	{AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH},
+#endif
+
+	/* ro/rr branch */
+	{AuBrRAttr_WH, AUFS_BRRATTR_WH},
+
+	/* rw branch */
+	{AuBrWAttr_MOO, AUFS_BRWATTR_MOO},
+	{AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH},
+
+	{0, NULL}
+};
+
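+/*
+ * parse a '+'-separated branch attribute list such as "wh+icex".
+ * an unknown attribute is warned about and terminates the parsing.
+ */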
+static int br_attr_val(char *str, match_table_t table, substring_t args[])
+{
+	int attr, v;
+	char *p;
+
+	attr = 0;
+	do {
+		p = strchr(str, '+');
+		if (p)
+			*p = 0;
+		v = match_token(str, table, args);
+		if (v) {
+			if (v & AuBrAttr_CMOO_Mask)
+				attr &= ~AuBrAttr_CMOO_Mask;
+			attr |= v;
+		} else {
+			if (p)
+				*p = '+';
+			pr_warn("ignored branch attribute %s\n", str);
+			break;
+		}
+		if (p)
+			str = p + 1;
+	} while (p);
+
+	return attr;
+}
+
+static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm)
+{
+	int sz;
+	const char *p;
+	char *q;
+
+	q = str->a;
+	*q = 0;
+	p = au_optstr(&perm, brattr);
+	if (p) {
+		sz = strlen(p);
+		memcpy(q, p, sz + 1);
+		q += sz;
+	} else
+		goto out;
+
+	do {
+		p = au_optstr(&perm, brattr);
+		if (p) {
+			*q++ = '+';
+			sz = strlen(p);
+			memcpy(q, p, sz + 1);
+			q += sz;
+		}
+	} while (p);
+
+out:
+	return q - str->a;
+}
+
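+/*
+ * parse a branch permission string such as "rw+nolwh".
+ * attributes which do not fit the ro/rr/rw class are dropped with a warning.
+ */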
+static int noinline_for_stack br_perm_val(char *perm)
+{
+	int val, bad, sz;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	au_br_perm_str_t attr;
+
+	p = strchr(perm, '+');
+	if (p)
+		*p = 0;
+	val = match_token(perm, brperm, args);
+	if (!val) {
+		if (p)
+			*p = '+';
+		pr_warn("ignored branch permission %s\n", perm);
+		val = AuBrPerm_RO;
+		goto out;
+	}
+	if (!p)
+		goto out;
+
+	val |= br_attr_val(p + 1, brattr, args);
+
+	bad = 0;
+	switch (val & AuBrPerm_Mask) {
+	case AuBrPerm_RO:
+	case AuBrPerm_RR:
+		bad = val & AuBrWAttr_Mask;
+		val &= ~AuBrWAttr_Mask;
+		break;
+	case AuBrPerm_RW:
+		bad = val & AuBrRAttr_Mask;
+		val &= ~AuBrRAttr_Mask;
+		break;
+	}
+
+	/*
+	 * the 'unpin' attrib became meaningless as of linux-3.18-rc1, but aufs
+	 * does not treat it as an error, just a warning.
+	 * this is a tiny guard for the user operation.
+	 */
+	if (val & AuBrAttr_UNPIN) {
+		bad |= AuBrAttr_UNPIN;
+		val &= ~AuBrAttr_UNPIN;
+	}
+
+	if (unlikely(bad)) {
+		sz = au_do_optstr_br_attr(&attr, bad);
+		AuDebugOn(!sz);
+		pr_warn("ignored branch attribute %s\n", attr.a);
+	}
+
+out:
+	return val;
+}
+
+void au_optstr_br_perm(au_br_perm_str_t *str, int perm)
+{
+	au_br_perm_str_t attr;
+	const char *p;
+	char *q;
+	int sz;
+
+	q = str->a;
+	p = au_optstr(&perm, brperm);
+	AuDebugOn(!p || !*p);
+	sz = strlen(p);
+	memcpy(q, p, sz + 1);
+	q += sz;
+
+	sz = au_do_optstr_br_attr(&attr, perm);
+	if (sz) {
+		*q++ = '+';
+		memcpy(q, attr.a, sz + 1);
+	}
+
+	AuDebugOn(strlen(str->a) >= sizeof(str->a));
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t udbalevel = {
+	{AuOpt_UDBA_REVAL, "reval"},
+	{AuOpt_UDBA_NONE, "none"},
+#ifdef CONFIG_AUFS_HNOTIFY
+	{AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	{AuOpt_UDBA_HNOTIFY, "fsnotify"},
+#endif
+#endif
+	{-1, NULL}
+};
+
+static int noinline_for_stack udba_val(char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	return match_token(str, udbalevel, args);
+}
+
+const char *au_optstr_udba(int udba)
+{
+	return au_parser_pattern(udba, udbalevel);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t au_wbr_create_policy = {
+	{AuWbrCreate_TDP, "tdp"},
+	{AuWbrCreate_TDP, "top-down-parent"},
+	{AuWbrCreate_RR, "rr"},
+	{AuWbrCreate_RR, "round-robin"},
+	{AuWbrCreate_MFS, "mfs"},
+	{AuWbrCreate_MFS, "most-free-space"},
+	{AuWbrCreate_MFSV, "mfs:%d"},
+	{AuWbrCreate_MFSV, "most-free-space:%d"},
+
+	{AuWbrCreate_MFSRR, "mfsrr:%d"},
+	{AuWbrCreate_MFSRRV, "mfsrr:%d:%d"},
+	{AuWbrCreate_PMFS, "pmfs"},
+	{AuWbrCreate_PMFSV, "pmfs:%d"},
+	{AuWbrCreate_PMFSRR, "pmfsrr:%d"},
+	{AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"},
+
+	{-1, NULL}
+};
+
+/*
+ * cf. linux/lib/parser.c and cmdline.c
+ * gave up calling memparse() since it uses simple_strtoull() instead of
+ * kstrto...().
+ */
+static int noinline_for_stack
+au_match_ull(substring_t *s, unsigned long long *result)
+{
+	int err;
+	unsigned int len;
+	char a[32];
+
+	err = -ERANGE;
+	len = s->to - s->from;
+	if (len + 1 <= sizeof(a)) {
+		memcpy(a, s->from, len);
+		a[len] = '\0';
+		err = kstrtoull(a, 0, result);
+	}
+	return err;
+}
+
+static int au_wbr_mfs_wmark(substring_t *arg, char *str,
+			    struct au_opt_wbr_create *create)
+{
+	int err;
+	unsigned long long ull;
+
+	err = 0;
+	if (!au_match_ull(arg, &ull))
+		create->mfsrr_watermark = ull;
+	else {
+		pr_err("bad integer in %s\n", str);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int au_wbr_mfs_sec(substring_t *arg, char *str,
+			  struct au_opt_wbr_create *create)
+{
+	int n, err;
+
+	err = 0;
+	if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC)
+		create->mfs_second = n;
+	else {
+		pr_err("bad integer in %s\n", str);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int noinline_for_stack
+au_wbr_create_val(char *str, struct au_opt_wbr_create *create)
+{
+	int err, e;
+	substring_t args[MAX_OPT_ARGS];
+
+	err = match_token(str, au_wbr_create_policy, args);
+	create->wbr_create = err;
+	switch (err) {
+	case AuWbrCreate_MFSRRV:
+	case AuWbrCreate_PMFSRRV:
+		e = au_wbr_mfs_wmark(&args[0], str, create);
+		if (!e)
+			e = au_wbr_mfs_sec(&args[1], str, create);
+		if (unlikely(e))
+			err = e;
+		break;
+	case AuWbrCreate_MFSRR:
+	case AuWbrCreate_PMFSRR:
+		e = au_wbr_mfs_wmark(&args[0], str, create);
+		if (unlikely(e)) {
+			err = e;
+			break;
+		}
+		/*FALLTHROUGH*/
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_PMFS:
+		create->mfs_second = AUFS_MFS_DEF_SEC;
+		break;
+	case AuWbrCreate_MFSV:
+	case AuWbrCreate_PMFSV:
+		e = au_wbr_mfs_sec(&args[0], str, create);
+		if (unlikely(e))
+			err = e;
+		break;
+	}
+
+	return err;
+}
+
+const char *au_optstr_wbr_create(int wbr_create)
+{
+	return au_parser_pattern(wbr_create, au_wbr_create_policy);
+}
+
+static match_table_t au_wbr_copyup_policy = {
+	{AuWbrCopyup_TDP, "tdp"},
+	{AuWbrCopyup_TDP, "top-down-parent"},
+	{AuWbrCopyup_BUP, "bup"},
+	{AuWbrCopyup_BUP, "bottom-up-parent"},
+	{AuWbrCopyup_BU, "bu"},
+	{AuWbrCopyup_BU, "bottom-up"},
+	{-1, NULL}
+};
+
+static int noinline_for_stack au_wbr_copyup_val(char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	return match_token(str, au_wbr_copyup_policy, args);
+}
+
+const char *au_optstr_wbr_copyup(int wbr_copyup)
+{
+	return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
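+/* dump every parsed option via AuDbg; a no-op unless CONFIG_AUFS_DEBUG is set */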
+static void dump_opts(struct au_opts *opts)
+{
+#ifdef CONFIG_AUFS_DEBUG
+	/* reduce stack space */
+	union {
+		struct au_opt_add *add;
+		struct au_opt_del *del;
+		struct au_opt_mod *mod;
+		struct au_opt_xino *xino;
+		struct au_opt_xino_itrunc *xino_itrunc;
+		struct au_opt_wbr_create *create;
+	} u;
+	struct au_opt *opt;
+
+	opt = opts->opt;
+	while (opt->type != Opt_tail) {
+		switch (opt->type) {
+		case Opt_add:
+			u.add = &opt->add;
+			AuDbg("add {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_del:
+		case Opt_idel:
+			u.del = &opt->del;
+			AuDbg("del {%s, %p}\n",
+			      u.del->pathname, u.del->h_path.dentry);
+			break;
+		case Opt_mod:
+		case Opt_imod:
+			u.mod = &opt->mod;
+			AuDbg("mod {%s, 0x%x, %p}\n",
+				  u.mod->path, u.mod->perm, u.mod->h_root);
+			break;
+		case Opt_append:
+			u.add = &opt->add;
+			AuDbg("append {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_prepend:
+			u.add = &opt->add;
+			AuDbg("prepend {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_dirwh:
+			AuDbg("dirwh %d\n", opt->dirwh);
+			break;
+		case Opt_rdcache:
+			AuDbg("rdcache %d\n", opt->rdcache);
+			break;
+		case Opt_rdblk:
+			AuDbg("rdblk %u\n", opt->rdblk);
+			break;
+		case Opt_rdblk_def:
+			AuDbg("rdblk_def\n");
+			break;
+		case Opt_rdhash:
+			AuDbg("rdhash %u\n", opt->rdhash);
+			break;
+		case Opt_rdhash_def:
+			AuDbg("rdhash_def\n");
+			break;
+		case Opt_xino:
+			u.xino = &opt->xino;
+			AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file);
+			break;
+		case Opt_trunc_xino:
+			AuLabel(trunc_xino);
+			break;
+		case Opt_notrunc_xino:
+			AuLabel(notrunc_xino);
+			break;
+		case Opt_trunc_xino_path:
+		case Opt_itrunc_xino:
+			u.xino_itrunc = &opt->xino_itrunc;
+			AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex);
+			break;
+		case Opt_noxino:
+			AuLabel(noxino);
+			break;
+		case Opt_trunc_xib:
+			AuLabel(trunc_xib);
+			break;
+		case Opt_notrunc_xib:
+			AuLabel(notrunc_xib);
+			break;
+		case Opt_shwh:
+			AuLabel(shwh);
+			break;
+		case Opt_noshwh:
+			AuLabel(noshwh);
+			break;
+		case Opt_dirperm1:
+			AuLabel(dirperm1);
+			break;
+		case Opt_nodirperm1:
+			AuLabel(nodirperm1);
+			break;
+		case Opt_plink:
+			AuLabel(plink);
+			break;
+		case Opt_noplink:
+			AuLabel(noplink);
+			break;
+		case Opt_list_plink:
+			AuLabel(list_plink);
+			break;
+		case Opt_udba:
+			AuDbg("udba %d, %s\n",
+				  opt->udba, au_optstr_udba(opt->udba));
+			break;
+		case Opt_dio:
+			AuLabel(dio);
+			break;
+		case Opt_nodio:
+			AuLabel(nodio);
+			break;
+		case Opt_diropq_a:
+			AuLabel(diropq_a);
+			break;
+		case Opt_diropq_w:
+			AuLabel(diropq_w);
+			break;
+		case Opt_warn_perm:
+			AuLabel(warn_perm);
+			break;
+		case Opt_nowarn_perm:
+			AuLabel(nowarn_perm);
+			break;
+		case Opt_verbose:
+			AuLabel(verbose);
+			break;
+		case Opt_noverbose:
+			AuLabel(noverbose);
+			break;
+		case Opt_sum:
+			AuLabel(sum);
+			break;
+		case Opt_nosum:
+			AuLabel(nosum);
+			break;
+		case Opt_wsum:
+			AuLabel(wsum);
+			break;
+		case Opt_wbr_create:
+			u.create = &opt->wbr_create;
+			AuDbg("create %d, %s\n", u.create->wbr_create,
+				  au_optstr_wbr_create(u.create->wbr_create));
+			switch (u.create->wbr_create) {
+			case AuWbrCreate_MFSV:
+			case AuWbrCreate_PMFSV:
+				AuDbg("%d sec\n", u.create->mfs_second);
+				break;
+			case AuWbrCreate_MFSRR:
+				AuDbg("%llu watermark\n",
+					  u.create->mfsrr_watermark);
+				break;
+			case AuWbrCreate_MFSRRV:
+			case AuWbrCreate_PMFSRRV:
+				AuDbg("%llu watermark, %d sec\n",
+					  u.create->mfsrr_watermark,
+					  u.create->mfs_second);
+				break;
+			}
+			break;
+		case Opt_wbr_copyup:
+			AuDbg("copyup %d, %s\n", opt->wbr_copyup,
+				  au_optstr_wbr_copyup(opt->wbr_copyup));
+			break;
+		case Opt_fhsm_sec:
+			AuDbg("fhsm_sec %u\n", opt->fhsm_second);
+			break;
+		case Opt_acl:
+			AuLabel(acl);
+			break;
+		case Opt_noacl:
+			AuLabel(noacl);
+			break;
+		default:
+			BUG();
+		}
+		opt++;
+	}
+#endif
+}
+
+void au_opts_free(struct au_opts *opts)
+{
+	struct au_opt *opt;
+
+	opt = opts->opt;
+	while (opt->type != Opt_tail) {
+		switch (opt->type) {
+		case Opt_add:
+		case Opt_append:
+		case Opt_prepend:
+			path_put(&opt->add.path);
+			break;
+		case Opt_del:
+		case Opt_idel:
+			path_put(&opt->del.h_path);
+			break;
+		case Opt_mod:
+		case Opt_imod:
+			dput(opt->mod.h_root);
+			break;
+		case Opt_xino:
+			fput(opt->xino.file);
+			break;
+		}
+		opt++;
+	}
+}
+
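+/*
+ * parse a single "branch[=permission]" string and resolve its path.
+ * when no permission is given, a branch on a natively read-only fs defaults
+ * to rr, the first branch of a writable mount to rw, and everything else
+ * to ro.
+ */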
+static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags,
+		   aufs_bindex_t bindex)
+{
+	int err;
+	struct au_opt_add *add = &opt->add;
+	char *p;
+
+	add->bindex = bindex;
+	add->perm = AuBrPerm_RO;
+	add->pathname = opt_str;
+	p = strchr(opt_str, '=');
+	if (p) {
+		*p++ = 0;
+		if (*p)
+			add->perm = br_perm_val(p);
+	}
+
+	err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path);
+	if (!err) {
+		if (!p) {
+			add->perm = AuBrPerm_RO;
+			if (au_test_fs_rr(add->path.dentry->d_sb))
+				add->perm = AuBrPerm_RR;
+			else if (!bindex && !(sb_flags & MS_RDONLY))
+				add->perm = AuBrPerm_RW;
+		}
+		opt->type = Opt_add;
+		goto out;
+	}
+	pr_err("lookup failed %s (%d)\n", add->pathname, err);
+	err = -EINVAL;
+
+out:
+	return err;
+}
+
+static int au_opts_parse_del(struct au_opt_del *del, substring_t args[])
+{
+	int err;
+
+	del->pathname = args[0].from;
+	AuDbg("del path %s\n", del->pathname);
+
+	err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path);
+	if (unlikely(err))
+		pr_err("lookup failed %s (%d)\n", del->pathname, err);
+
+	return err;
+}
+
+#if 0 /* reserved for future use */
+static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex,
+			      struct au_opt_del *del, substring_t args[])
+{
+	int err;
+	struct dentry *root;
+
+	err = -EINVAL;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	if (bindex < 0 || au_sbend(sb) < bindex) {
+		pr_err("out of bounds, %d\n", bindex);
+		goto out;
+	}
+
+	err = 0;
+	del->h_path.dentry = dget(au_h_dptr(root, bindex));
+	del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex));
+
+out:
+	aufs_read_unlock(root, !AuLock_IR);
+	return err;
+}
+#endif
+
+static int noinline_for_stack
+au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[])
+{
+	int err;
+	struct path path;
+	char *p;
+
+	err = -EINVAL;
+	mod->path = args[0].from;
+	p = strchr(mod->path, '=');
+	if (unlikely(!p)) {
+		pr_err("no permission %s\n", args[0].from);
+		goto out;
+	}
+
+	*p++ = 0;
+	err = vfsub_kern_path(mod->path, lkup_dirflags, &path);
+	if (unlikely(err)) {
+		pr_err("lookup failed %s (%d)\n", mod->path, err);
+		goto out;
+	}
+
+	mod->perm = br_perm_val(p);
+	AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p);
+	mod->h_root = dget(path.dentry);
+	path_put(&path);
+
+out:
+	return err;
+}
+
+#if 0 /* reserved for future use */
+static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex,
+			      struct au_opt_mod *mod, substring_t args[])
+{
+	int err;
+	struct dentry *root;
+
+	err = -EINVAL;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	if (bindex < 0 || au_sbend(sb) < bindex) {
+		pr_err("out of bounds, %d\n", bindex);
+		goto out;
+	}
+
+	err = 0;
+	mod->perm = br_perm_val(args[1].from);
+	AuDbg("mod path %s, perm 0x%x, %s\n",
+	      mod->path, mod->perm, args[1].from);
+	mod->h_root = dget(au_h_dptr(root, bindex));
+
+out:
+	aufs_read_unlock(root, !AuLock_IR);
+	return err;
+}
+#endif
+
+static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino,
+			      substring_t args[])
+{
+	int err;
+	struct file *file;
+
+	file = au_xino_create(sb, args[0].from, /*silent*/0);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(file->f_path.dentry->d_sb == sb)) {
+		fput(file);
+		pr_err("%s must be outside\n", args[0].from);
+		goto out;
+	}
+
+	err = 0;
+	xino->file = file;
+	xino->path = args[0].from;
+
+out:
+	return err;
+}
+
+static int noinline_for_stack
+au_opts_parse_xino_itrunc_path(struct super_block *sb,
+			       struct au_opt_xino_itrunc *xino_itrunc,
+			       substring_t args[])
+{
+	int err;
+	aufs_bindex_t bend, bindex;
+	struct path path;
+	struct dentry *root;
+
+	err = vfsub_kern_path(args[0].from, lkup_dirflags, &path);
+	if (unlikely(err)) {
+		pr_err("lookup failed %s (%d)\n", args[0].from, err);
+		goto out;
+	}
+
+	xino_itrunc->bindex = -1;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		if (au_h_dptr(root, bindex) == path.dentry) {
+			xino_itrunc->bindex = bindex;
+			break;
+		}
+	}
+	aufs_read_unlock(root, !AuLock_IR);
+	path_put(&path);
+
+	if (unlikely(xino_itrunc->bindex < 0)) {
+		pr_err("no such branch %s\n", args[0].from);
+		err = -EINVAL;
+	}
+
+out:
+	return err;
+}
+
+/* called without aufs lock */
+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
+{
+	int err, n, token;
+	aufs_bindex_t bindex;
+	unsigned char skipped;
+	struct dentry *root;
+	struct au_opt *opt, *opt_tail;
+	char *opt_str;
+	/* reduce the stack space */
+	union {
+		struct au_opt_xino_itrunc *xino_itrunc;
+		struct au_opt_wbr_create *create;
+	} u;
+	struct {
+		substring_t args[MAX_OPT_ARGS];
+	} *a;
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	root = sb->s_root;
+	err = 0;
+	bindex = 0;
+	opt = opts->opt;
+	opt_tail = opt + opts->max_opt - 1;
+	opt->type = Opt_tail;
+	while (!err && (opt_str = strsep(&str, ",")) && *opt_str) {
+		err = -EINVAL;
+		skipped = 0;
+		token = match_token(opt_str, options, a->args);
+		switch (token) {
+		case Opt_br:
+			err = 0;
+			while (!err && (opt_str = strsep(&a->args[0].from, ":"))
+			       && *opt_str) {
+				err = opt_add(opt, opt_str, opts->sb_flags,
+					      bindex++);
+				if (unlikely(!err && ++opt > opt_tail)) {
+					err = -E2BIG;
+					break;
+				}
+				opt->type = Opt_tail;
+				skipped = 1;
+			}
+			break;
+		case Opt_add:
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			bindex = n;
+			err = opt_add(opt, a->args[1].from, opts->sb_flags,
+				      bindex);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_append:
+			err = opt_add(opt, a->args[0].from, opts->sb_flags,
+				      /*dummy bindex*/1);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_prepend:
+			err = opt_add(opt, a->args[0].from, opts->sb_flags,
+				      /*bindex*/0);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_del:
+			err = au_opts_parse_del(&opt->del, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#if 0 /* reserved for future use */
+		case Opt_idel:
+			del->pathname = "(indexed)";
+			if (unlikely(match_int(&args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			err = au_opts_parse_idel(sb, n, &opt->del, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#endif
+		case Opt_mod:
+			err = au_opts_parse_mod(&opt->mod, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#ifdef IMOD /* reserved for future use */
+		case Opt_imod:
+			u.mod->path = "(indexed)";
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			err = au_opts_parse_imod(sb, n, &opt->mod, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#endif
+		case Opt_xino:
+			err = au_opts_parse_xino(sb, &opt->xino, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+
+		case Opt_trunc_xino_path:
+			err = au_opts_parse_xino_itrunc_path
+				(sb, &opt->xino_itrunc, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+
+		case Opt_itrunc_xino:
+			u.xino_itrunc = &opt->xino_itrunc;
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			u.xino_itrunc->bindex = n;
+			aufs_read_lock(root, AuLock_FLUSH);
+			if (n < 0 || au_sbend(sb) < n) {
+				pr_err("out of bounds, %d\n", n);
+				aufs_read_unlock(root, !AuLock_IR);
+				break;
+			}
+			aufs_read_unlock(root, !AuLock_IR);
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_dirwh:
+			if (unlikely(match_int(&a->args[0], &opt->dirwh)))
+				break;
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_rdcache:
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (unlikely(n > AUFS_RDCACHE_MAX)) {
+				pr_err("rdcache must be smaller than %d\n",
+				       AUFS_RDCACHE_MAX);
+				break;
+			}
+			opt->rdcache = n;
+			err = 0;
+			opt->type = token;
+			break;
+		case Opt_rdblk:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0
+				     || n > KMALLOC_MAX_SIZE)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (unlikely(n && n < NAME_MAX)) {
+				pr_err("rdblk must be larger than %d\n",
+				       NAME_MAX);
+				break;
+			}
+			opt->rdblk = n;
+			err = 0;
+			opt->type = token;
+			break;
+		case Opt_rdhash:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0
+				     || n * sizeof(struct hlist_head)
+				     > KMALLOC_MAX_SIZE)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			opt->rdhash = n;
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_trunc_xino:
+		case Opt_notrunc_xino:
+		case Opt_noxino:
+		case Opt_trunc_xib:
+		case Opt_notrunc_xib:
+		case Opt_shwh:
+		case Opt_noshwh:
+		case Opt_dirperm1:
+		case Opt_nodirperm1:
+		case Opt_plink:
+		case Opt_noplink:
+		case Opt_list_plink:
+		case Opt_dio:
+		case Opt_nodio:
+		case Opt_diropq_a:
+		case Opt_diropq_w:
+		case Opt_warn_perm:
+		case Opt_nowarn_perm:
+		case Opt_verbose:
+		case Opt_noverbose:
+		case Opt_sum:
+		case Opt_nosum:
+		case Opt_wsum:
+		case Opt_rdblk_def:
+		case Opt_rdhash_def:
+		case Opt_acl:
+		case Opt_noacl:
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_udba:
+			opt->udba = udba_val(a->args[0].from);
+			if (opt->udba >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+
+		case Opt_wbr_create:
+			u.create = &opt->wbr_create;
+			u.create->wbr_create
+				= au_wbr_create_val(a->args[0].from, u.create);
+			if (u.create->wbr_create >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+		case Opt_wbr_copyup:
+			opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from);
+			if (opt->wbr_copyup >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+
+		case Opt_fhsm_sec:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (sysaufs_brs) {
+				opt->fhsm_second = n;
+				opt->type = token;
+			} else
+				pr_warn("ignored %s\n", opt_str);
+			err = 0;
+			break;
+
+		case Opt_ignore:
+			pr_warn("ignored %s\n", opt_str);
+			/*FALLTHROUGH*/
+		case Opt_ignore_silent:
+			skipped = 1;
+			err = 0;
+			break;
+		case Opt_err:
+			pr_err("unknown option %s\n", opt_str);
+			break;
+		}
+
+		if (!err && !skipped) {
+			if (unlikely(++opt > opt_tail)) {
+				err = -E2BIG;
+				opt--;
+				opt->type = Opt_tail;
+				break;
+			}
+			opt->type = Opt_tail;
+		}
+	}
+
+	kfree(a);
+	dump_opts(opts);
+	if (unlikely(err))
+		au_opts_free(opts);
+
+out:
+	return err;
+}
+
+static int au_opt_wbr_create(struct super_block *sb,
+			     struct au_opt_wbr_create *create)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 1; /* handled */
+	sbinfo = au_sbi(sb);
+	if (sbinfo->si_wbr_create_ops->fin) {
+		err = sbinfo->si_wbr_create_ops->fin(sb);
+		if (!err)
+			err = 1;
+	}
+
+	sbinfo->si_wbr_create = create->wbr_create;
+	sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create;
+	switch (create->wbr_create) {
+	case AuWbrCreate_MFSRRV:
+	case AuWbrCreate_MFSRR:
+	case AuWbrCreate_PMFSRR:
+	case AuWbrCreate_PMFSRRV:
+		sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark;
+		/*FALLTHROUGH*/
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_MFSV:
+	case AuWbrCreate_PMFS:
+	case AuWbrCreate_PMFSV:
+		sbinfo->si_wbr_mfs.mfs_expire
+			= msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC);
+		break;
+	}
+
+	if (sbinfo->si_wbr_create_ops->init)
+		sbinfo->si_wbr_create_ops->init(sb); /* ignore */
+
+	return err;
+}
+
+/*
+ * returns,
+ * plus: processed without an error
+ * zero: unprocessed
+ */
+static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
+			 struct au_opts *opts)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 1; /* handled */
+	sbinfo = au_sbi(sb);
+	switch (opt->type) {
+	case Opt_udba:
+		sbinfo->si_mntflags &= ~AuOptMask_UDBA;
+		sbinfo->si_mntflags |= opt->udba;
+		opts->given_udba |= opt->udba;
+		break;
+
+	case Opt_plink:
+		au_opt_set(sbinfo->si_mntflags, PLINK);
+		break;
+	case Opt_noplink:
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_put(sb, /*verbose*/1);
+		au_opt_clr(sbinfo->si_mntflags, PLINK);
+		break;
+	case Opt_list_plink:
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_list(sb);
+		break;
+
+	case Opt_dio:
+		au_opt_set(sbinfo->si_mntflags, DIO);
+		au_fset_opts(opts->flags, REFRESH_DYAOP);
+		break;
+	case Opt_nodio:
+		au_opt_clr(sbinfo->si_mntflags, DIO);
+		au_fset_opts(opts->flags, REFRESH_DYAOP);
+		break;
+
+	case Opt_fhsm_sec:
+		au_fhsm_set(sbinfo, opt->fhsm_second);
+		break;
+
+	case Opt_diropq_a:
+		au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ);
+		break;
+	case Opt_diropq_w:
+		au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ);
+		break;
+
+	case Opt_warn_perm:
+		au_opt_set(sbinfo->si_mntflags, WARN_PERM);
+		break;
+	case Opt_nowarn_perm:
+		au_opt_clr(sbinfo->si_mntflags, WARN_PERM);
+		break;
+
+	case Opt_verbose:
+		au_opt_set(sbinfo->si_mntflags, VERBOSE);
+		break;
+	case Opt_noverbose:
+		au_opt_clr(sbinfo->si_mntflags, VERBOSE);
+		break;
+
+	case Opt_sum:
+		au_opt_set(sbinfo->si_mntflags, SUM);
+		break;
+	case Opt_wsum:
+		au_opt_clr(sbinfo->si_mntflags, SUM);
+		au_opt_set(sbinfo->si_mntflags, SUM_W);
+		break;
+	case Opt_nosum:
+		au_opt_clr(sbinfo->si_mntflags, SUM);
+		au_opt_clr(sbinfo->si_mntflags, SUM_W);
+		break;
+
+	case Opt_wbr_create:
+		err = au_opt_wbr_create(sb, &opt->wbr_create);
+		break;
+	case Opt_wbr_copyup:
+		sbinfo->si_wbr_copyup = opt->wbr_copyup;
+		sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
+		break;
+
+	case Opt_dirwh:
+		sbinfo->si_dirwh = opt->dirwh;
+		break;
+
+	case Opt_rdcache:
+		sbinfo->si_rdcache
+			= msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC);
+		break;
+	case Opt_rdblk:
+		sbinfo->si_rdblk = opt->rdblk;
+		break;
+	case Opt_rdblk_def:
+		sbinfo->si_rdblk = AUFS_RDBLK_DEF;
+		break;
+	case Opt_rdhash:
+		sbinfo->si_rdhash = opt->rdhash;
+		break;
+	case Opt_rdhash_def:
+		sbinfo->si_rdhash = AUFS_RDHASH_DEF;
+		break;
+
+	case Opt_shwh:
+		au_opt_set(sbinfo->si_mntflags, SHWH);
+		break;
+	case Opt_noshwh:
+		au_opt_clr(sbinfo->si_mntflags, SHWH);
+		break;
+
+	case Opt_dirperm1:
+		au_opt_set(sbinfo->si_mntflags, DIRPERM1);
+		break;
+	case Opt_nodirperm1:
+		au_opt_clr(sbinfo->si_mntflags, DIRPERM1);
+		break;
+
+	case Opt_trunc_xino:
+		au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
+		break;
+	case Opt_notrunc_xino:
+		au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO);
+		break;
+
+	case Opt_trunc_xino_path:
+	case Opt_itrunc_xino:
+		err = au_xino_trunc(sb, opt->xino_itrunc.bindex);
+		if (!err)
+			err = 1;
+		break;
+
+	case Opt_trunc_xib:
+		au_fset_opts(opts->flags, TRUNC_XIB);
+		break;
+	case Opt_notrunc_xib:
+		au_fclr_opts(opts->flags, TRUNC_XIB);
+		break;
+
+	case Opt_acl:
+		sb->s_flags |= MS_POSIXACL;
+		break;
+	case Opt_noacl:
+		sb->s_flags &= ~MS_POSIXACL;
+		break;
+
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * returns tri-state.
+ * plus: processed without an error
+ * zero: unprocessed
+ * minus: error
+ */
+static int au_opt_br(struct super_block *sb, struct au_opt *opt,
+		     struct au_opts *opts)
+{
+	int err, do_refresh;
+
+	err = 0;
+	switch (opt->type) {
+	case Opt_append:
+		opt->add.bindex = au_sbend(sb) + 1;
+		if (opt->add.bindex < 0)
+			opt->add.bindex = 0;
+		goto add;
+	case Opt_prepend:
+		opt->add.bindex = 0;
+	add: /* indented label */
+	case Opt_add:
+		err = au_br_add(sb, &opt->add,
+				au_ftest_opts(opts->flags, REMOUNT));
+		if (!err) {
+			err = 1;
+			au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+
+	case Opt_del:
+	case Opt_idel:
+		err = au_br_del(sb, &opt->del,
+				au_ftest_opts(opts->flags, REMOUNT));
+		if (!err) {
+			err = 1;
+			au_fset_opts(opts->flags, TRUNC_XIB);
+			au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+
+	case Opt_mod:
+	case Opt_imod:
+		err = au_br_mod(sb, &opt->mod,
+				au_ftest_opts(opts->flags, REMOUNT),
+				&do_refresh);
+		if (!err) {
+			err = 1;
+			if (do_refresh)
+				au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+	}
+
+	return err;
+}
+
+static int au_opt_xino(struct super_block *sb, struct au_opt *opt,
+		       struct au_opt_xino **opt_xino,
+		       struct au_opts *opts)
+{
+	int err;
+	aufs_bindex_t bend, bindex;
+	struct dentry *root, *parent, *h_root;
+
+	err = 0;
+	switch (opt->type) {
+	case Opt_xino:
+		err = au_xino_set(sb, &opt->xino,
+				  !!au_ftest_opts(opts->flags, REMOUNT));
+		if (unlikely(err))
+			break;
+
+		*opt_xino = &opt->xino;
+		au_xino_brid_set(sb, -1);
+
+		/* safe d_parent access */
+		parent = opt->xino.file->f_path.dentry->d_parent;
+		root = sb->s_root;
+		bend = au_sbend(sb);
+		for (bindex = 0; bindex <= bend; bindex++) {
+			h_root = au_h_dptr(root, bindex);
+			if (h_root == parent) {
+				au_xino_brid_set(sb, au_sbr_id(sb, bindex));
+				break;
+			}
+		}
+		break;
+
+	case Opt_noxino:
+		au_xino_clr(sb);
+		au_xino_brid_set(sb, -1);
+		*opt_xino = (void *)-1;
+		break;
+	}
+
+	return err;
+}
+
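+/*
+ * verify the combination of the current options and branches: warn about
+ * questionable combinations, set up the whiteout base where necessary, and
+ * refresh the NO_DREVAL and FHSM states.
+ */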
+int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
+		   unsigned int pending)
+{
+	int err, fhsm;
+	aufs_bindex_t bindex, bend;
+	unsigned char do_plink, skip, do_free, can_no_dreval;
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *root, *dentry;
+	struct inode *dir, *h_dir;
+	struct au_sbinfo *sbinfo;
+	struct au_hinode *hdir;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA));
+
+	if (!(sb_flags & MS_RDONLY)) {
+		if (unlikely(!au_br_writable(au_sbr_perm(sb, 0))))
+			pr_warn("first branch should be rw\n");
+		if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH)))
+			pr_warn_once("shwh should be used with ro\n");
+	}
+
+	if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY)
+	    && !au_opt_test(sbinfo->si_mntflags, XINO))
+		pr_warn_once("udba=*notify requires xino\n");
+
+	if (au_opt_test(sbinfo->si_mntflags, DIRPERM1))
+		pr_warn_once("dirperm1 breaks the protection"
+			     " by the permission bits on the lower branch\n");
+
+	err = 0;
+	fhsm = 0;
+	root = sb->s_root;
+	dir = d_inode(root);
+	do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK);
+	can_no_dreval = !!au_opt_test((sbinfo->si_mntflags | pending),
+				      UDBA_NONE);
+	bend = au_sbend(sb);
+	for (bindex = 0; !err && bindex <= bend; bindex++) {
+		skip = 0;
+		h_dir = au_h_iptr(dir, bindex);
+		br = au_sbr(sb, bindex);
+
+		if ((br->br_perm & AuBrAttr_ICEX)
+		    && !h_dir->i_op->listxattr)
+			br->br_perm &= ~AuBrAttr_ICEX;
+#if 0
+		if ((br->br_perm & AuBrAttr_ICEX_SEC)
+		    && (au_br_sb(br)->s_flags & MS_NOSEC))
+			br->br_perm &= ~AuBrAttr_ICEX_SEC;
+#endif
+
+		do_free = 0;
+		wbr = br->br_wbr;
+		if (wbr)
+			wbr_wh_read_lock(wbr);
+
+		if (!au_br_writable(br->br_perm)) {
+			do_free = !!wbr;
+			skip = (!wbr
+				|| (!wbr->wbr_whbase
+				    && !wbr->wbr_plink
+				    && !wbr->wbr_orph));
+		} else if (!au_br_wh_linkable(br->br_perm)) {
+			/* skip = (!br->br_whbase && !br->br_orph); */
+			skip = (!wbr || !wbr->wbr_whbase);
+			if (skip && wbr) {
+				if (do_plink)
+					skip = !!wbr->wbr_plink;
+				else
+					skip = !wbr->wbr_plink;
+			}
+		} else {
+			/* skip = (br->br_whbase && br->br_ohph); */
+			skip = (wbr && wbr->wbr_whbase);
+			if (skip) {
+				if (do_plink)
+					skip = !!wbr->wbr_plink;
+				else
+					skip = !wbr->wbr_plink;
+			}
+		}
+		if (wbr)
+			wbr_wh_read_unlock(wbr);
+
+		if (can_no_dreval) {
+			dentry = br->br_path.dentry;
+			spin_lock(&dentry->d_lock);
+			if (dentry->d_flags &
+			    (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE))
+				can_no_dreval = 0;
+			spin_unlock(&dentry->d_lock);
+		}
+
+		if (au_br_fhsm(br->br_perm)) {
+			fhsm++;
+			AuDebugOn(!br->br_fhsm);
+		}
+
+		if (skip)
+			continue;
+
+		hdir = au_hi(dir, bindex);
+		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
+		if (wbr)
+			wbr_wh_write_lock(wbr);
+		err = au_wh_init(br, sb);
+		if (wbr)
+			wbr_wh_write_unlock(wbr);
+		au_hn_imtx_unlock(hdir);
+
+		if (!err && do_free) {
+			kfree(wbr);
+			br->br_wbr = NULL;
+		}
+	}
+
+	if (can_no_dreval)
+		au_fset_si(sbinfo, NO_DREVAL);
+	else
+		au_fclr_si(sbinfo, NO_DREVAL);
+
+	if (fhsm >= 2) {
+		au_fset_si(sbinfo, FHSM);
+		for (bindex = bend; bindex >= 0; bindex--) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm)) {
+				au_fhsm_set_bottom(sb, bindex);
+				break;
+			}
+		}
+	} else {
+		au_fclr_si(sbinfo, FHSM);
+		au_fhsm_set_bottom(sb, -1);
+	}
+
+	return err;
+}
+
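+/*
+ * apply the parsed options at mount time: simple options first, then the
+ * branches with xino/udba temporarily disabled, then xino, verify the
+ * combination, and finally restore udba.
+ */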
+int au_opts_mount(struct super_block *sb, struct au_opts *opts)
+{
+	int err;
+	unsigned int tmp;
+	aufs_bindex_t bindex, bend;
+	struct au_opt *opt;
+	struct au_opt_xino *opt_xino, xino;
+	struct au_sbinfo *sbinfo;
+	struct au_branch *br;
+	struct inode *dir;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	opt_xino = NULL;
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail)
+		err = au_opt_simple(sb, opt++, opts);
+	if (err > 0)
+		err = 0;
+	else if (unlikely(err < 0))
+		goto out;
+
+	/* disable xino and udba temporarily */
+	sbinfo = au_sbi(sb);
+	tmp = sbinfo->si_mntflags;
+	au_opt_clr(sbinfo->si_mntflags, XINO);
+	au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL);
+
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail)
+		err = au_opt_br(sb, opt++, opts);
+	if (err > 0)
+		err = 0;
+	else if (unlikely(err < 0))
+		goto out;
+
+	bend = au_sbend(sb);
+	if (unlikely(bend < 0)) {
+		err = -EINVAL;
+		pr_err("no branches\n");
+		goto out;
+	}
+
+	if (au_opt_test(tmp, XINO))
+		au_opt_set(sbinfo->si_mntflags, XINO);
+	opt = opts->opt;
+	while (!err && opt->type != Opt_tail)
+		err = au_opt_xino(sb, opt++, &opt_xino, opts);
+	if (unlikely(err))
+		goto out;
+
+	err = au_opts_verify(sb, sb->s_flags, tmp);
+	if (unlikely(err))
+		goto out;
+
+	/* restore xino */
+	if (au_opt_test(tmp, XINO) && !opt_xino) {
+		xino.file = au_xino_def(sb);
+		err = PTR_ERR(xino.file);
+		if (IS_ERR(xino.file))
+			goto out;
+
+		err = au_xino_set(sb, &xino, /*remount*/0);
+		fput(xino.file);
+		if (unlikely(err))
+			goto out;
+	}
+
+	/* restore udba */
+	tmp &= AuOptMask_UDBA;
+	sbinfo->si_mntflags &= ~AuOptMask_UDBA;
+	sbinfo->si_mntflags |= tmp;
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		err = au_hnotify_reset_br(tmp, br, br->br_perm);
+		if (unlikely(err))
+			AuIOErr("hnotify failed on br %d, %d, ignored\n",
+				bindex, err);
+		/* go on even if err */
+	}
+	if (au_opt_test(tmp, UDBA_HNOTIFY)) {
+		dir = d_inode(sb->s_root);
+		au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO);
+	}
+
+out:
+	return err;
+}
+
+int au_opts_remount(struct super_block *sb, struct au_opts *opts)
+{
+	int err, rerr;
+	unsigned char no_dreval;
+	struct inode *dir;
+	struct au_opt_xino *opt_xino;
+	struct au_opt *opt;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	dir = d_inode(sb->s_root);
+	sbinfo = au_sbi(sb);
+	opt_xino = NULL;
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail) {
+		err = au_opt_simple(sb, opt, opts);
+		if (!err)
+			err = au_opt_br(sb, opt, opts);
+		if (!err)
+			err = au_opt_xino(sb, opt, &opt_xino, opts);
+		opt++;
+	}
+	if (err > 0)
+		err = 0;
+	AuTraceErr(err);
+	/* go on even if err */
+
+	no_dreval = !!au_ftest_si(sbinfo, NO_DREVAL);
+	rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0);
+	if (unlikely(rerr && !err))
+		err = rerr;
+
+	if (no_dreval != !!au_ftest_si(sbinfo, NO_DREVAL))
+		au_fset_opts(opts->flags, REFRESH_IDOP);
+
+	if (au_ftest_opts(opts->flags, TRUNC_XIB)) {
+		rerr = au_xib_trunc(sb);
+		if (unlikely(rerr && !err))
+			err = rerr;
+	}
+
+	/* will be handled by the caller */
+	if (!au_ftest_opts(opts->flags, REFRESH)
+	    && (opts->given_udba
+		|| au_opt_test(sbinfo->si_mntflags, XINO)
+		|| au_ftest_opts(opts->flags, REFRESH_IDOP)
+		    ))
+		au_fset_opts(opts->flags, REFRESH);
+
+	AuDbg("status 0x%x\n", opts->flags);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+unsigned int au_opt_udba(struct super_block *sb)
+{
+	return au_mntflags(sb) & AuOptMask_UDBA;
+}
diff --git b/fs/aufs/opts.h b/fs/aufs/opts.h
new file mode 100644
index 0000000..0d6c2e1
--- /dev/null
+++ b/fs/aufs/opts.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount options/flags
+ */
+
+#ifndef __AUFS_OPTS_H__
+#define __AUFS_OPTS_H__
+
+#ifdef __KERNEL__
+
+#include <linux/path.h>
+
+struct file;
+struct super_block;
+
+/* ---------------------------------------------------------------------- */
+
+/* mount flags */
+#define AuOpt_XINO		1		/* external inode number bitmap
+						   and translation table */
+#define AuOpt_TRUNC_XINO	(1 << 1)	/* truncate xino files */
+#define AuOpt_UDBA_NONE		(1 << 2)	/* user's direct branch access */
+#define AuOpt_UDBA_REVAL	(1 << 3)
+#define AuOpt_UDBA_HNOTIFY	(1 << 4)
+#define AuOpt_SHWH		(1 << 5)	/* show whiteout */
+#define AuOpt_PLINK		(1 << 6)	/* pseudo-link */
+#define AuOpt_DIRPERM1		(1 << 7)	/* ignore the lower dir's perm
+						   bits */
+#define AuOpt_ALWAYS_DIROPQ	(1 << 9)	/* policy for creating diropq */
+#define AuOpt_SUM		(1 << 10)	/* summation for statfs(2) */
+#define AuOpt_SUM_W		(1 << 11)	/* unimplemented */
+#define AuOpt_WARN_PERM		(1 << 12)	/* warn when add-branch */
+#define AuOpt_VERBOSE		(1 << 13)	/* busy inode when del-branch */
+#define AuOpt_DIO		(1 << 14)	/* direct io */
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuOpt_UDBA_HNOTIFY
+#define AuOpt_UDBA_HNOTIFY	0
+#endif
+#ifndef CONFIG_AUFS_SHWH
+#undef AuOpt_SHWH
+#define AuOpt_SHWH		0
+#endif
+
+#define AuOpt_Def	(AuOpt_XINO \
+			 | AuOpt_UDBA_REVAL \
+			 | AuOpt_PLINK \
+			 /* | AuOpt_DIRPERM1 */ \
+			 | AuOpt_WARN_PERM)
+#define AuOptMask_UDBA	(AuOpt_UDBA_NONE \
+			 | AuOpt_UDBA_REVAL \
+			 | AuOpt_UDBA_HNOTIFY)
+
+#define au_opt_test(flags, name)	(flags & AuOpt_##name)
+#define au_opt_set(flags, name) do { \
+	BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \
+	((flags) |= AuOpt_##name); \
+} while (0)
+#define au_opt_set_udba(flags, name) do { \
+	(flags) &= ~AuOptMask_UDBA; \
+	((flags) |= AuOpt_##name); \
+} while (0)
+#define au_opt_clr(flags, name) do { \
+	((flags) &= ~AuOpt_##name); \
+} while (0)
+
+static inline unsigned int au_opts_plink(unsigned int mntflags)
+{
+#ifdef CONFIG_PROC_FS
+	return mntflags;
+#else
+	return mntflags & ~AuOpt_PLINK;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies to select one among multiple writable branches */
+enum {
+	AuWbrCreate_TDP,	/* top down parent */
+	AuWbrCreate_RR,		/* round robin */
+	AuWbrCreate_MFS,	/* most free space */
+	AuWbrCreate_MFSV,	/* mfs with seconds */
+	AuWbrCreate_MFSRR,	/* mfs then rr */
+	AuWbrCreate_MFSRRV,	/* mfs then rr with seconds */
+	AuWbrCreate_PMFS,	/* parent and mfs */
+	AuWbrCreate_PMFSV,	/* parent and mfs with seconds */
+	AuWbrCreate_PMFSRR,	/* parent, mfs and round-robin */
+	AuWbrCreate_PMFSRRV,	/* plus seconds */
+
+	AuWbrCreate_Def = AuWbrCreate_TDP
+};
+
+enum {
+	AuWbrCopyup_TDP,	/* top down parent */
+	AuWbrCopyup_BUP,	/* bottom up parent */
+	AuWbrCopyup_BU,		/* bottom up */
+
+	AuWbrCopyup_Def = AuWbrCopyup_TDP
+};
+
+/* ---------------------------------------------------------------------- */
+
+struct au_opt_add {
+	aufs_bindex_t	bindex;
+	char		*pathname;
+	int		perm;
+	struct path	path;
+};
+
+struct au_opt_del {
+	char		*pathname;
+	struct path	h_path;
+};
+
+struct au_opt_mod {
+	char		*path;
+	int		perm;
+	struct dentry	*h_root;
+};
+
+struct au_opt_xino {
+	char		*path;
+	struct file	*file;
+};
+
+struct au_opt_xino_itrunc {
+	aufs_bindex_t	bindex;
+};
+
+struct au_opt_wbr_create {
+	int			wbr_create;
+	int			mfs_second;
+	unsigned long long	mfsrr_watermark;
+};
+
+struct au_opt {
+	int type;
+	union {
+		struct au_opt_xino	xino;
+		struct au_opt_xino_itrunc xino_itrunc;
+		struct au_opt_add	add;
+		struct au_opt_del	del;
+		struct au_opt_mod	mod;
+		int			dirwh;
+		int			rdcache;
+		unsigned int		rdblk;
+		unsigned int		rdhash;
+		int			udba;
+		struct au_opt_wbr_create wbr_create;
+		int			wbr_copyup;
+		unsigned int		fhsm_second;
+	};
+};
+
+/* opts flags */
+#define AuOpts_REMOUNT		1
+#define AuOpts_REFRESH		(1 << 1)
+#define AuOpts_TRUNC_XIB	(1 << 2)
+#define AuOpts_REFRESH_DYAOP	(1 << 3)
+#define AuOpts_REFRESH_IDOP	(1 << 4)
+#define au_ftest_opts(flags, name)	((flags) & AuOpts_##name)
+#define au_fset_opts(flags, name) \
+	do { (flags) |= AuOpts_##name; } while (0)
+#define au_fclr_opts(flags, name) \
+	do { (flags) &= ~AuOpts_##name; } while (0)
+
+struct au_opts {
+	struct au_opt	*opt;
+	int		max_opt;
+
+	unsigned int	given_udba;
+	unsigned int	flags;
+	unsigned long	sb_flags;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* opts.c */
+void au_optstr_br_perm(au_br_perm_str_t *str, int perm);
+const char *au_optstr_udba(int udba);
+const char *au_optstr_wbr_copyup(int wbr_copyup);
+const char *au_optstr_wbr_create(int wbr_create);
+
+void au_opts_free(struct au_opts *opts);
+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts);
+int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
+		   unsigned int pending);
+int au_opts_mount(struct super_block *sb, struct au_opts *opts);
+int au_opts_remount(struct super_block *sb, struct au_opts *opts);
+
+unsigned int au_opt_udba(struct super_block *sb);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_OPTS_H__ */
diff --git b/fs/aufs/plink.c b/fs/aufs/plink.c
new file mode 100644
index 0000000..b6d6326
--- /dev/null
+++ b/fs/aufs/plink.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * pseudo-link
+ */
+
+#include "aufs.h"
+
+/*
+ * the pseudo-link maintenance mode.
+ * while a user process maintains the pseudo-links,
+ * adding a new plink and manipulating the branches are prohibited.
+ *
+ * Flags
+ * NOPLM:
+ *	For entry functions which will handle plinks and where i_mutex is
+ *	already held in VFS.
+ *	They cannot wait and should return an error at once.
+ *	Callers have to check the error.
+ * NOPLMW:
+ *	For entry functions which will handle plinks, but i_mutex is not held
+ *	in VFS.
+ *	They can wait for the plink maintenance mode to finish.
+ *
+ * They behave like F_SETLK and F_SETLKW.
+ * If the caller never handles plinks, then both flags are unnecessary.
+ */
+
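+/*
+ * returns 0 when the caller may go on. while another process is in the
+ * maintenance mode, NOPLMW makes the caller wait and NOPLM makes it fail
+ * with -EAGAIN.
+ */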
+int au_plink_maint(struct super_block *sb, int flags)
+{
+	int err;
+	pid_t pid, ppid;
+	struct au_sbinfo *sbi;
+
+	SiMustAnyLock(sb);
+
+	err = 0;
+	if (!au_opt_test(au_mntflags(sb), PLINK))
+		goto out;
+
+	sbi = au_sbi(sb);
+	pid = sbi->si_plink_maint_pid;
+	if (!pid || pid == current->pid)
+		goto out;
+
+	/* todo: it highly depends upon /sbin/mount.aufs */
+	rcu_read_lock();
+	ppid = task_pid_vnr(rcu_dereference(current->real_parent));
+	rcu_read_unlock();
+	if (pid == ppid)
+		goto out;
+
+	if (au_ftest_lock(flags, NOPLMW)) {
+		/* if there is no i_mutex lock in VFS, we don't need to wait */
+		/* AuDebugOn(!lockdep_depth(current)); */
+		while (sbi->si_plink_maint_pid) {
+			si_read_unlock(sb);
+			/* gave up wake_up_bit() */
+			wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid);
+
+			if (au_ftest_lock(flags, FLUSH))
+				au_nwt_flush(&sbi->si_nowait);
+			si_noflush_read_lock(sb);
+		}
+	} else if (au_ftest_lock(flags, NOPLM)) {
+		AuDbg("ppid %d, pid %d\n", ppid, pid);
+		err = -EAGAIN;
+	}
+
+out:
+	return err;
+}
+
+void au_plink_maint_leave(struct au_sbinfo *sbinfo)
+{
+	spin_lock(&sbinfo->si_plink_maint_lock);
+	sbinfo->si_plink_maint_pid = 0;
+	spin_unlock(&sbinfo->si_plink_maint_lock);
+	wake_up_all(&sbinfo->si_plink_wq);
+}
+
+int au_plink_maint_enter(struct super_block *sb)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	/* make sure I am the only one in this fs */
+	si_write_lock(sb, AuLock_FLUSH);
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		spin_lock(&sbinfo->si_plink_maint_lock);
+		if (!sbinfo->si_plink_maint_pid)
+			sbinfo->si_plink_maint_pid = current->pid;
+		else
+			err = -EBUSY;
+		spin_unlock(&sbinfo->si_plink_maint_lock);
+	}
+	si_write_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_DEBUG
+void au_plink_list(struct super_block *sb)
+{
+	int i;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(icntnr, plink_hlist, plink)
+			AuDbg("%lu\n", icntnr->vfs_inode.i_ino);
+		rcu_read_unlock();
+	}
+}
+#endif
+
+/* is the inode pseudo-linked? */
+int au_plink_test(struct inode *inode)
+{
+	int found, i;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+
+	sbinfo = au_sbi(inode->i_sb);
+	AuRwMustAnyLock(&sbinfo->si_rwsem);
+	AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK));
+	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
+
+	found = 0;
+	i = au_plink_hash(inode->i_ino);
+	plink_hlist = &sbinfo->si_plink[i].head;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(icntnr, plink_hlist, plink)
+		if (&icntnr->vfs_inode == inode) {
+			found = 1;
+			break;
+		}
+	rcu_read_unlock();
+	return found;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * generate a name for plink.
+ * the file will be stored under AUFS_WH_PLINKDIR.
+ */
+/* 20 is the max number of decimal digits of a 64bit ulong */
+#define PLINK_NAME_LEN	((20 + 1) * 2)
+
+static int plink_name(char *name, int len, struct inode *inode,
+		      aufs_bindex_t bindex)
+{
+	int rlen;
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, bindex);
+	rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino);
+	return rlen;
+}
+
+struct au_do_plink_lkup_args {
+	struct dentry **errp;
+	struct qstr *tgtname;
+	struct dentry *h_parent;
+	struct au_branch *br;
+};
+
+static struct dentry *au_do_plink_lkup(struct qstr *tgtname,
+				       struct dentry *h_parent,
+				       struct au_branch *br)
+{
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_inode = d_inode(h_parent);
+	inode_lock_nested(h_inode, AuLsc_I_CHILD2);
+	h_dentry = vfsub_lkup_one(tgtname, h_parent);
+	inode_unlock(h_inode);
+	return h_dentry;
+}
+
+static void au_call_do_plink_lkup(void *args)
+{
+	struct au_do_plink_lkup_args *a = args;
+	*a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br);
+}
+
+/* lookup the plink-ed @inode under the branch at @bindex */
+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex)
+{
+	struct dentry *h_dentry, *h_parent;
+	struct au_branch *br;
+	int wkq_err;
+	char a[PLINK_NAME_LEN];
+	struct qstr tgtname = QSTR_INIT(a, 0);
+
+	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
+
+	br = au_sbr(inode->i_sb, bindex);
+	h_parent = br->br_wbr->wbr_plink;
+	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
+
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
+		struct au_do_plink_lkup_args args = {
+			.errp		= &h_dentry,
+			.tgtname	= &tgtname,
+			.h_parent	= h_parent,
+			.br		= br
+		};
+
+		wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args);
+		if (unlikely(wkq_err))
+			h_dentry = ERR_PTR(wkq_err);
+	} else
+		h_dentry = au_do_plink_lkup(&tgtname, h_parent, br);
+
+	return h_dentry;
+}
+
+/* create a pseudo-link */
+static int do_whplink(struct qstr *tgt, struct dentry *h_parent,
+		      struct dentry *h_dentry, struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+	struct inode *h_dir, *delegated;
+
+	h_dir = d_inode(h_parent);
+	inode_lock_nested(h_dir, AuLsc_I_CHILD2);
+again:
+	h_path.dentry = vfsub_lkup_one(tgt, h_parent);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	err = 0;
+	/* wh.plink dir is not monitored */
+	/* todo: is it really safe? */
+	if (d_is_positive(h_path.dentry)
+	    && d_inode(h_path.dentry) != d_inode(h_dentry)) {
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+		dput(h_path.dentry);
+		h_path.dentry = NULL;
+		if (!err)
+			goto again;
+	}
+	if (!err && d_is_negative(h_path.dentry)) {
+		delegated = NULL;
+		err = vfsub_link(h_dentry, h_dir, &h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal link\n");
+			iput(delegated);
+		}
+	}
+	dput(h_path.dentry);
+
+out:
+	inode_unlock(h_dir);
+	return err;
+}
+
+struct do_whplink_args {
+	int *errp;
+	struct qstr *tgt;
+	struct dentry *h_parent;
+	struct dentry *h_dentry;
+	struct au_branch *br;
+};
+
+static void call_do_whplink(void *args)
+{
+	struct do_whplink_args *a = args;
+	*a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br);
+}
+
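+/*
+ * create the pseudo-link hard link for @h_dentry under the branch's plink
+ * dir, going through the superio workqueue unless the caller is already root.
+ */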
+static int whplink(struct dentry *h_dentry, struct inode *inode,
+		   aufs_bindex_t bindex, struct au_branch *br)
+{
+	int err, wkq_err;
+	struct au_wbr *wbr;
+	struct dentry *h_parent;
+	char a[PLINK_NAME_LEN];
+	struct qstr tgtname = QSTR_INIT(a, 0);
+
+	wbr = au_sbr(inode->i_sb, bindex)->br_wbr;
+	h_parent = wbr->wbr_plink;
+	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
+
+	/* always superio. */
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
+		struct do_whplink_args args = {
+			.errp		= &err,
+			.tgt		= &tgtname,
+			.h_parent	= h_parent,
+			.h_dentry	= h_dentry,
+			.br		= br
+		};
+		wkq_err = au_wkq_wait(call_do_whplink, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	} else
+		err = do_whplink(&tgtname, h_parent, h_dentry, br);
+
+	return err;
+}
+
+/*
+ * create a new pseudo-link for @h_dentry on @bindex.
+ * the linked inode is held in aufs @inode.
+ */
+void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
+		     struct dentry *h_dentry)
+{
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+	struct au_sphlhead *sphl;
+	int found, err, cnt, i;
+
+	sb = inode->i_sb;
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	found = au_plink_test(inode);
+	if (found)
+		return;
+
+	i = au_plink_hash(inode->i_ino);
+	sphl = sbinfo->si_plink + i;
+	plink_hlist = &sphl->head;
+	au_igrab(inode);
+
+	spin_lock(&sphl->spin);
+	hlist_for_each_entry(icntnr, plink_hlist, plink) {
+		if (&icntnr->vfs_inode == inode) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found) {
+		icntnr = container_of(inode, struct au_icntnr, vfs_inode);
+		hlist_add_head_rcu(&icntnr->plink, plink_hlist);
+	}
+	spin_unlock(&sphl->spin);
+	if (!found) {
+		cnt = au_sphl_count(sphl);
+#define msg "unexpectedly unbalanced or too many pseudo-links"
+		if (cnt > AUFS_PLINK_WARN)
+			AuWarn1(msg ", %d\n", cnt);
+#undef msg
+		err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex));
+		if (unlikely(err)) {
+			pr_warn("err %d, damaged pseudo link.\n", err);
+			au_sphl_del_rcu(&icntnr->plink, sphl);
+			iput(&icntnr->vfs_inode);
+		}
+	} else
+		iput(&icntnr->vfs_inode);
+}
+
+/* free all plinks */
+void au_plink_put(struct super_block *sb, int verbose)
+{
+	int i, warned;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct hlist_node *tmp;
+	struct au_icntnr *icntnr;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	/* no spin_lock since sbinfo is write-locked */
+	warned = 0;
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		if (!warned && verbose && !hlist_empty(plink_hlist)) {
+			pr_warn("pseudo-link is not flushed\n");
+			warned = 1;
+		}
+		hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink)
+			iput(&icntnr->vfs_inode);
+		INIT_HLIST_HEAD(plink_hlist);
+	}
+}
+
+void au_plink_clean(struct super_block *sb, int verbose)
+{
+	struct dentry *root;
+
+	root = sb->s_root;
+	aufs_write_lock(root);
+	if (au_opt_test(au_mntflags(sb), PLINK))
+		au_plink_put(sb, verbose);
+	aufs_write_unlock(root);
+}
+
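+/*
+ * forget the lower inode on the branch @br_id and tell whether the aufs
+ * inode no longer refers to any branch, i.e. whether this plink can be
+ * released.
+ */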
+static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id)
+{
+	int do_put;
+	aufs_bindex_t bstart, bend, bindex;
+
+	do_put = 0;
+	bstart = au_ibstart(inode);
+	bend = au_ibend(inode);
+	if (bstart >= 0) {
+		for (bindex = bstart; bindex <= bend; bindex++) {
+			if (!au_h_iptr(inode, bindex)
+			    || au_ii_br_id(inode, bindex) != br_id)
+				continue;
+			au_set_h_iptr(inode, bindex, NULL, 0);
+			do_put = 1;
+			break;
+		}
+		if (do_put)
+			for (bindex = bstart; bindex <= bend; bindex++)
+				if (au_h_iptr(inode, bindex)) {
+					do_put = 0;
+					break;
+				}
+	} else
+		do_put = 1;
+
+	return do_put;
+}
+
+/* free the plinks on a branch specified by @br_id */
+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id)
+{
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct hlist_node *tmp;
+	struct au_icntnr *icntnr;
+	struct inode *inode;
+	int i, do_put;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	/* no spin_lock since sbinfo is write-locked */
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink) {
+			inode = au_igrab(&icntnr->vfs_inode);
+			ii_write_lock_child(inode);
+			do_put = au_plink_do_half_refresh(inode, br_id);
+			if (do_put) {
+				hlist_del(&icntnr->plink);
+				iput(inode);
+			}
+			ii_write_unlock(inode);
+			iput(inode);
+		}
+	}
+}
diff --git b/fs/aufs/poll.c b/fs/aufs/poll.c
new file mode 100644
index 0000000..dd2baf5
--- /dev/null
+++ b/fs/aufs/poll.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * poll operation
+ * Currently there is only one filesystem which implements the ->poll operation.
+ */
+
+#include "aufs.h"
+
+unsigned int aufs_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask;
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	/* We should pretend an error happened. */
+	mask = POLLERR /* | POLLIN | POLLOUT */;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	/* it is not an error if h_file has no operation */
+	mask = DEFAULT_POLLMASK;
+	if (h_file->f_op->poll)
+		mask = h_file->f_op->poll(h_file, wait);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	AuTraceErr((int)mask);
+	return mask;
+}
diff --git b/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c
new file mode 100644
index 0000000..d6eeebd
--- /dev/null
+++ b/fs/aufs/posix_acl.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2014-2016 Junjiro R. Okajima
+ */
+
+/*
+ * posix acl operations
+ */
+
+#include <linux/fs.h>
+#include "aufs.h"
+
+struct posix_acl *aufs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	int err;
+	aufs_bindex_t bindex;
+	struct inode *h_inode;
+	struct super_block *sb;
+
+	acl = NULL;
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_read_lock_child(inode);
+	if (!(sb->s_flags & MS_POSIXACL))
+		goto out;
+
+	bindex = au_ibstart(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (unlikely(!h_inode
+		     || ((h_inode->i_mode & S_IFMT)
+			 != (inode->i_mode & S_IFMT)))) {
+		err = au_busy_or_stale();
+		acl = ERR_PTR(err);
+		goto out;
+	}
+
+	/* always topmost only */
+	acl = get_acl(h_inode, type);
+
+out:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+
+	AuTraceErrPtr(acl);
+	return acl;
+}
+
+int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int err;
+	ssize_t ssz;
+	struct dentry *dentry;
+	struct au_srxattr arg = {
+		.type = AU_ACL_SET,
+		.u.acl_set = {
+			.acl	= acl,
+			.type	= type
+		},
+	};
+
+	inode_lock(inode);
+	if (inode->i_ino == AUFS_ROOT_INO)
+		dentry = dget(inode->i_sb->s_root);
+	else {
+		dentry = d_find_alias(inode);
+		if (!dentry)
+			dentry = d_find_any_alias(inode);
+		if (!dentry) {
+			pr_warn("cannot handle this inode, "
+				"please report to aufs-users ML\n");
+			err = -ENOENT;
+			goto out;
+		}
+	}
+
+	ssz = au_srxattr(dentry, &arg);
+	dput(dentry);
+	err = ssz;
+	if (ssz >= 0)
+		err = 0;
+
+out:
+	inode_unlock(inode);
+	return err;
+}
diff --git b/fs/aufs/procfs.c b/fs/aufs/procfs.c
new file mode 100644
index 0000000..e5a4e37
--- /dev/null
+++ b/fs/aufs/procfs.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * procfs interfaces
+ */
+
+#include <linux/proc_fs.h>
+#include "aufs.h"
+
+static int au_procfs_plm_release(struct inode *inode, struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = file->private_data;
+	if (sbinfo) {
+		au_plink_maint_leave(sbinfo);
+		kobject_put(&sbinfo->si_kobj);
+	}
+
+	return 0;
+}
+
+static void au_procfs_plm_write_clean(struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = file->private_data;
+	if (sbinfo)
+		au_plink_clean(sbinfo->si_sb, /*verbose*/0);
+}
+
+static int au_procfs_plm_write_si(struct file *file, unsigned long id)
+{
+	int err;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	err = -EBUSY;
+	if (unlikely(file->private_data))
+		goto out;
+
+	sb = NULL;
+	/* don't use au_sbilist_lock() here */
+	spin_lock(&au_sbilist.spin);
+	hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list)
+		if (id == sysaufs_si_id(sbinfo)) {
+			kobject_get(&sbinfo->si_kobj);
+			sb = sbinfo->si_sb;
+			break;
+		}
+	spin_unlock(&au_sbilist.spin);
+
+	err = -EINVAL;
+	if (unlikely(!sb))
+		goto out;
+
+	err = au_plink_maint_enter(sb);
+	if (!err)
+		/* keep kobject_get() */
+		file->private_data = sbinfo;
+	else
+		kobject_put(&sbinfo->si_kobj);
+out:
+	return err;
+}
+
+/*
+ * Accept a valid "si=xxxx" only.
+ * Once it is accepted successfully, accept "clean" too.
+ */
+static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf,
+				   size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	unsigned long id;
+	/* "si=" + hex id + an optional last newline + the terminating NUL */
+	char buf[3 + sizeof(unsigned long) * 2 + 1 + 1];
+
+	err = -EACCES;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(count >= sizeof(buf)))
+		goto out;
+
+	err = copy_from_user(buf, ubuf, count);
+	if (unlikely(err)) {
+		err = -EFAULT;
+		goto out;
+	}
+	buf[count] = 0;
+
+	err = -EINVAL;
+	if (!strcmp("clean", buf)) {
+		au_procfs_plm_write_clean(file);
+		goto out_success;
+	} else if (unlikely(strncmp("si=", buf, 3)))
+		goto out;
+
+	err = kstrtoul(buf + 3, 16, &id);
+	if (unlikely(err))
+		goto out;
+
+	err = au_procfs_plm_write_si(file, id);
+	if (unlikely(err))
+		goto out;
+
+out_success:
+	err = count; /* success */
+out:
+	return err;
+}
+
+static const struct file_operations au_procfs_plm_fop = {
+	.write		= au_procfs_plm_write,
+	.release	= au_procfs_plm_release,
+	.owner		= THIS_MODULE
+};
+
+/* ---------------------------------------------------------------------- */
+
+static struct proc_dir_entry *au_procfs_dir;
+
+void au_procfs_fin(void)
+{
+	remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir);
+	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
+}
+
+int __init au_procfs_init(void)
+{
+	int err;
+	struct proc_dir_entry *entry;
+
+	err = -ENOMEM;
+	au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL);
+	if (unlikely(!au_procfs_dir))
+		goto out;
+
+	entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR,
+			    au_procfs_dir, &au_procfs_plm_fop);
+	if (unlikely(!entry))
+		goto out_dir;
+
+	err = 0;
+	goto out; /* success */
+
+out_dir:
+	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
+out:
+	return err;
+}
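
The proc file created above drives pseudo-link maintenance from user space: a CAP_SYS_ADMIN process writes "si=<hex id>" to enter maintenance mode for one aufs mount, may then write "clean" to flush the pseudo-links, and leaves the mode simply by closing the file (the release handler above). A minimal user-space sketch; the path /proc/fs/aufs/plink_maint (the expansion of AUFS_PLINK_MAINT_DIR/AUFS_PLINK_MAINT_NAME) and taking <id> from /sys/fs/aufs/si_<id> or the "si=" mount option are assumptions here, not confirmed by this patch alone.

	/* user-space sketch; see the assumptions above */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	static int plink_clean(unsigned long si_id)
	{
		int fd, len;
		char buf[32];

		fd = open("/proc/fs/aufs/plink_maint", O_WRONLY);
		if (fd < 0)
			return -1;
		/* enter maintenance mode for the mount identified by si_id */
		len = snprintf(buf, sizeof(buf), "si=%lx", si_id);
		if (write(fd, buf, len) != len)
			goto out_err;
		/* once accepted, "clean" flushes the pseudo-links */
		if (write(fd, "clean", 5) != 5)
			goto out_err;
		/* closing the file leaves maintenance mode (release handler) */
		return close(fd);

	out_err:
		close(fd);
		return -1;
	}
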
diff --git b/fs/aufs/rdu.c b/fs/aufs/rdu.c
new file mode 100644
index 0000000..ec08446
--- /dev/null
+++ b/fs/aufs/rdu.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * readdir in userspace.
+ */
+
+#include <linux/compat.h>
+#include <linux/fs_stack.h>
+#include <linux/security.h>
+#include "aufs.h"
+
+/* bits for struct aufs_rdu.flags */
+#define	AuRdu_CALLED	1
+#define	AuRdu_CONT	(1 << 1)
+#define	AuRdu_FULL	(1 << 2)
+#define au_ftest_rdu(flags, name)	((flags) & AuRdu_##name)
+#define au_fset_rdu(flags, name) \
+	do { (flags) |= AuRdu_##name; } while (0)
+#define au_fclr_rdu(flags, name) \
+	do { (flags) &= ~AuRdu_##name; } while (0)
+
+struct au_rdu_arg {
+	struct dir_context		ctx;
+	struct aufs_rdu			*rdu;
+	union au_rdu_ent_ul		ent;
+	unsigned long			end;
+
+	struct super_block		*sb;
+	int				err;
+};
+
+static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen,
+		       loff_t offset, u64 h_ino, unsigned int d_type)
+{
+	int err, len;
+	struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx);
+	struct aufs_rdu *rdu = arg->rdu;
+	struct au_rdu_ent ent;
+
+	err = 0;
+	arg->err = 0;
+	au_fset_rdu(rdu->cookie.flags, CALLED);
+	len = au_rdu_len(nlen);
+	if (arg->ent.ul + len  < arg->end) {
+		ent.ino = h_ino;
+		ent.bindex = rdu->cookie.bindex;
+		ent.type = d_type;
+		ent.nlen = nlen;
+		if (unlikely(nlen > AUFS_MAX_NAMELEN))
+			ent.type = DT_UNKNOWN;
+
+		/* unnecessary to support mmap_sem since this is a dir */
+		err = -EFAULT;
+		if (copy_to_user(arg->ent.e, &ent, sizeof(ent)))
+			goto out;
+		if (copy_to_user(arg->ent.e->name, name, nlen))
+			goto out;
+		/* the terminating NULL */
+		if (__put_user(0, arg->ent.e->name + nlen))
+			goto out;
+		err = 0;
+		/* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */
+		arg->ent.ul += len;
+		rdu->rent++;
+	} else {
+		err = -EFAULT;
+		au_fset_rdu(rdu->cookie.flags, FULL);
+		rdu->full = 1;
+		rdu->tail = arg->ent;
+	}
+
+out:
+	/* AuTraceErr(err); */
+	return err;
+}
+
+static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg)
+{
+	int err;
+	loff_t offset;
+	struct au_rdu_cookie *cookie = &arg->rdu->cookie;
+
+	/* we don't have to care about (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */
+	offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET);
+	err = offset;
+	if (unlikely(offset != cookie->h_pos))
+		goto out;
+
+	err = 0;
+	do {
+		arg->err = 0;
+		au_fclr_rdu(cookie->flags, CALLED);
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(h_file, &arg->ctx);
+		if (err >= 0)
+			err = arg->err;
+	} while (!err
+		 && au_ftest_rdu(cookie->flags, CALLED)
+		 && !au_ftest_rdu(cookie->flags, FULL));
+	cookie->h_pos = h_file->f_pos;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_rdu(struct file *file, struct aufs_rdu *rdu)
+{
+	int err;
+	aufs_bindex_t bend;
+	struct au_rdu_arg arg = {
+		.ctx = {
+			.actor = au_rdu_fill
+		}
+	};
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *h_file;
+	struct au_rdu_cookie *cookie = &rdu->cookie;
+
+	err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz);
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	rdu->rent = 0;
+	rdu->tail = rdu->ent;
+	rdu->full = 0;
+	arg.rdu = rdu;
+	arg.ent = rdu->ent;
+	arg.end = arg.ent.ul;
+	arg.end += rdu->sz;
+
+	err = -ENOTDIR;
+	if (unlikely(!file->f_op->iterate))
+		goto out;
+
+	err = security_file_permission(file, MAY_READ);
+	AuTraceErr(err);
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+#if 1
+	inode_lock(inode);
+#else
+	/* todo: create a new inline func inode_lock_killable() */
+	err = mutex_lock_killable(&inode->i_mutex);
+	AuTraceErr(err);
+	if (unlikely(err))
+		goto out;
+#endif
+
+	arg.sb = inode->i_sb;
+	err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_mtx;
+	err = au_alive_dir(dentry);
+	if (unlikely(err))
+		goto out_si;
+	/* todo: reval? */
+	fi_read_lock(file);
+
+	err = -EAGAIN;
+	if (unlikely(au_ftest_rdu(cookie->flags, CONT)
+		     && cookie->generation != au_figen(file)))
+		goto out_unlock;
+
+	err = 0;
+	if (!rdu->blk) {
+		rdu->blk = au_sbi(arg.sb)->si_rdblk;
+		if (!rdu->blk)
+			rdu->blk = au_dir_size(file, /*dentry*/NULL);
+	}
+	bend = au_fbstart(file);
+	if (cookie->bindex < bend)
+		cookie->bindex = bend;
+	bend = au_fbend_dir(file);
+	/* AuDbg("b%d, b%d\n", cookie->bindex, bend); */
+	for (; !err && cookie->bindex <= bend;
+	     cookie->bindex++, cookie->h_pos = 0) {
+		h_file = au_hf_dir(file, cookie->bindex);
+		if (!h_file)
+			continue;
+
+		au_fclr_rdu(cookie->flags, FULL);
+		err = au_rdu_do(h_file, &arg);
+		AuTraceErr(err);
+		if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err))
+			break;
+	}
+	AuDbg("rent %llu\n", rdu->rent);
+
+	if (!err && !au_ftest_rdu(cookie->flags, CONT)) {
+		rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH);
+		au_fset_rdu(cookie->flags, CONT);
+		cookie->generation = au_figen(file);
+	}
+
+	ii_read_lock_child(inode);
+	fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode)));
+	ii_read_unlock(inode);
+
+out_unlock:
+	fi_read_unlock(file);
+out_si:
+	si_read_unlock(arg.sb);
+out_mtx:
+	inode_unlock(inode);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu)
+{
+	int err;
+	ino_t ino;
+	unsigned long long nent;
+	union au_rdu_ent_ul *u;
+	struct au_rdu_ent ent;
+	struct super_block *sb;
+
+	err = 0;
+	nent = rdu->nent;
+	u = &rdu->ent;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	while (nent-- > 0) {
+		/* unnecessary to support mmap_sem since this is a dir */
+		err = copy_from_user(&ent, u->e, sizeof(ent));
+		if (!err)
+			err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino));
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+			break;
+		}
+
+		/* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */
+		if (!ent.wh)
+			err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino);
+		else
+			err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type,
+					&ino);
+		if (unlikely(err)) {
+			AuTraceErr(err);
+			break;
+		}
+
+		err = __put_user(ino, &u->e->ino);
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+			break;
+		}
+		u->ul += au_rdu_len(ent.nlen);
+	}
+	si_read_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_rdu_verify(struct aufs_rdu *rdu)
+{
+	AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | "
+	      "%llu, b%d, 0x%x, g%u}\n",
+	      rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ],
+	      rdu->blk,
+	      rdu->rent, rdu->shwh, rdu->full,
+	      rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags,
+	      rdu->cookie.generation);
+
+	if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu))
+		return 0;
+
+	AuDbg("%u:%u\n",
+	      rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu));
+	return -EINVAL;
+}
+
+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err, e;
+	struct aufs_rdu rdu;
+	void __user *p = (void __user *)arg;
+
+	err = copy_from_user(&rdu, p, sizeof(rdu));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	err = au_rdu_verify(&rdu);
+	if (unlikely(err))
+		goto out;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+		err = au_rdu(file, &rdu);
+		if (unlikely(err))
+			break;
+
+		e = copy_to_user(p, &rdu, sizeof(rdu));
+		if (unlikely(e)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+		break;
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ino(file, &rdu);
+		break;
+
+	default:
+		/* err = -ENOTTY; */
+		err = -EINVAL;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err, e;
+	struct aufs_rdu rdu;
+	void __user *p = compat_ptr(arg);
+
+	/* todo: get_user()? */
+	err = copy_from_user(&rdu, p, sizeof(rdu));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	rdu.ent.e = compat_ptr(rdu.ent.ul);
+	err = au_rdu_verify(&rdu);
+	if (unlikely(err))
+		goto out;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+		err = au_rdu(file, &rdu);
+		if (unlikely(err))
+			break;
+
+		rdu.ent.ul = ptr_to_compat(rdu.ent.e);
+		rdu.tail.ul = ptr_to_compat(rdu.tail.e);
+		e = copy_to_user(p, &rdu, sizeof(rdu));
+		if (unlikely(e)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+		break;
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ino(file, &rdu);
+		break;
+
+	default:
+		/* err = -ENOTTY; */
+		err = -EINVAL;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+#endif
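
Together the two ioctls above implement readdir in user space: AUFS_CTL_RDU fills the caller's buffer with au_rdu_ent records and keeps its position in the embedded cookie, and AUFS_CTL_RDU_INO rewrites the branch inode numbers of those records into aufs inode numbers. A rough user-space sketch of the loop; it assumes the aufs uapi header (named <linux/aufs_type.h> here) provides struct aufs_rdu and the ioctl numbers, and it leaves out whiteout handling (ent.wh, the shwh mode).

	/* user-space sketch; header name and details are assumptions */
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/aufs_type.h>

	/* read the whole directory opened as 'fd' through 'buf' of 'sz' bytes */
	static int rdu_dir(int fd, void *buf, unsigned long long sz)
	{
		struct aufs_rdu rdu;

		memset(&rdu, 0, sizeof(rdu));	/* zeroed cookie: start over */
		rdu.verify[AufsCtlRduV_SZ] = sizeof(rdu);
		rdu.sz = sz;
		rdu.ent.e = buf;
		/* rdu.blk == 0: let aufs pick si_rdblk or the dir size */
		do {
			if (ioctl(fd, AUFS_CTL_RDU, &rdu) < 0)
				return -1;
			/* translate the branch inode numbers of the
			 * rdu.rent entries now sitting in 'buf' */
			rdu.nent = rdu.rent;
			if (rdu.rent && ioctl(fd, AUFS_CTL_RDU_INO, &rdu) < 0)
				return -1;
			/* ... consume the entries in 'buf' here ... */
		} while (rdu.full);	/* the cookie keeps the position */

		return 0;
	}
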
diff --git b/fs/aufs/rwsem.h b/fs/aufs/rwsem.h
new file mode 100644
index 0000000..ef50c2c
--- /dev/null
+++ b/fs/aufs/rwsem.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * simple read-write semaphore wrappers
+ */
+
+#ifndef __AUFS_RWSEM_H__
+#define __AUFS_RWSEM_H__
+
+#ifdef __KERNEL__
+
+#include "debug.h"
+
+struct au_rwsem {
+	struct rw_semaphore	rwsem;
+#ifdef CONFIG_AUFS_DEBUG
+	/* just for debugging, not almighty counter */
+	atomic_t		rcnt, wcnt;
+#endif
+};
+
+#ifdef CONFIG_AUFS_DEBUG
+#define AuDbgCntInit(rw) do { \
+	atomic_set(&(rw)->rcnt, 0); \
+	atomic_set(&(rw)->wcnt, 0); \
+	smp_mb(); /* atomic set */ \
+} while (0)
+
+#define AuDbgRcntInc(rw)	atomic_inc(&(rw)->rcnt)
+#define AuDbgRcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0)
+#define AuDbgWcntInc(rw)	atomic_inc(&(rw)->wcnt)
+#define AuDbgWcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0)
+#else
+#define AuDbgCntInit(rw)	do {} while (0)
+#define AuDbgRcntInc(rw)	do {} while (0)
+#define AuDbgRcntDec(rw)	do {} while (0)
+#define AuDbgWcntInc(rw)	do {} while (0)
+#define AuDbgWcntDec(rw)	do {} while (0)
+#endif /* CONFIG_AUFS_DEBUG */
+
+/* to debug easier, do not make them inlined functions */
+#define AuRwMustNoWaiters(rw)	AuDebugOn(!list_empty(&(rw)->rwsem.wait_list))
+/* rwsem_is_locked() is unusable */
+#define AuRwMustReadLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0)
+#define AuRwMustWriteLock(rw)	AuDebugOn(atomic_read(&(rw)->wcnt) <= 0)
+#define AuRwMustAnyLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \
+					&& atomic_read(&(rw)->wcnt) <= 0)
+#define AuRwDestroy(rw)		AuDebugOn(atomic_read(&(rw)->rcnt) \
+					|| atomic_read(&(rw)->wcnt))
+
+#define au_rw_class(rw, key)	lockdep_set_class(&(rw)->rwsem, key)
+
+static inline void au_rw_init(struct au_rwsem *rw)
+{
+	AuDbgCntInit(rw);
+	init_rwsem(&rw->rwsem);
+}
+
+static inline void au_rw_init_wlock(struct au_rwsem *rw)
+{
+	au_rw_init(rw);
+	down_write(&rw->rwsem);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_init_wlock_nested(struct au_rwsem *rw,
+					   unsigned int lsc)
+{
+	au_rw_init(rw);
+	down_write_nested(&rw->rwsem, lsc);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_read_lock(struct au_rwsem *rw)
+{
+	down_read(&rw->rwsem);
+	AuDbgRcntInc(rw);
+}
+
+static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc)
+{
+	down_read_nested(&rw->rwsem, lsc);
+	AuDbgRcntInc(rw);
+}
+
+static inline void au_rw_read_unlock(struct au_rwsem *rw)
+{
+	AuRwMustReadLock(rw);
+	AuDbgRcntDec(rw);
+	up_read(&rw->rwsem);
+}
+
+static inline void au_rw_dgrade_lock(struct au_rwsem *rw)
+{
+	AuRwMustWriteLock(rw);
+	AuDbgRcntInc(rw);
+	AuDbgWcntDec(rw);
+	downgrade_write(&rw->rwsem);
+}
+
+static inline void au_rw_write_lock(struct au_rwsem *rw)
+{
+	down_write(&rw->rwsem);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_write_lock_nested(struct au_rwsem *rw,
+					   unsigned int lsc)
+{
+	down_write_nested(&rw->rwsem, lsc);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_write_unlock(struct au_rwsem *rw)
+{
+	AuRwMustWriteLock(rw);
+	AuDbgWcntDec(rw);
+	up_write(&rw->rwsem);
+}
+
+/* why is the _nested version not defined? */
+static inline int au_rw_read_trylock(struct au_rwsem *rw)
+{
+	int ret;
+
+	ret = down_read_trylock(&rw->rwsem);
+	if (ret)
+		AuDbgRcntInc(rw);
+	return ret;
+}
+
+static inline int au_rw_write_trylock(struct au_rwsem *rw)
+{
+	int ret;
+
+	ret = down_write_trylock(&rw->rwsem);
+	if (ret)
+		AuDbgWcntInc(rw);
+	return ret;
+}
+
+#undef AuDbgCntInit
+#undef AuDbgRcntInc
+#undef AuDbgRcntDec
+#undef AuDbgWcntInc
+#undef AuDbgWcntDec
+
+#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
+static inline void prefix##_read_lock(param) \
+{ au_rw_read_lock(rwsem); } \
+static inline void prefix##_write_lock(param) \
+{ au_rw_write_lock(rwsem); } \
+static inline int prefix##_read_trylock(param) \
+{ return au_rw_read_trylock(rwsem); } \
+static inline int prefix##_write_trylock(param) \
+{ return au_rw_write_trylock(rwsem); }
+/* why is the _nested version not defined? */
+/* static inline void prefix##_read_trylock_nested(param, lsc)
+{ au_rw_read_trylock_nested(rwsem, lsc)); }
+static inline void prefix##_write_trylock_nestd(param, lsc)
+{ au_rw_write_trylock_nested(rwsem, lsc); } */
+
+#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \
+static inline void prefix##_read_unlock(param) \
+{ au_rw_read_unlock(rwsem); } \
+static inline void prefix##_write_unlock(param) \
+{ au_rw_write_unlock(rwsem); } \
+static inline void prefix##_downgrade_lock(param) \
+{ au_rw_dgrade_lock(rwsem); }
+
+#define AuSimpleRwsemFuncs(prefix, param, rwsem) \
+	AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
+	AuSimpleUnlockRwsemFuncs(prefix, param, rwsem)
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_RWSEM_H__ */
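
AuSimpleRwsemFuncs is how the other headers in this patch generate their per-object lock wrappers. For a hypothetical container (the name 'foo' is made up for illustration, and struct foo is assumed to embed a 'struct au_rwsem rwsem' member), AuSimpleRwsemFuncs(foo, struct foo *f, &f->rwsem) expands to:

	static inline void foo_read_lock(struct foo *f)
	{ au_rw_read_lock(&f->rwsem); }
	static inline void foo_write_lock(struct foo *f)
	{ au_rw_write_lock(&f->rwsem); }
	static inline int foo_read_trylock(struct foo *f)
	{ return au_rw_read_trylock(&f->rwsem); }
	static inline int foo_write_trylock(struct foo *f)
	{ return au_rw_write_trylock(&f->rwsem); }
	static inline void foo_read_unlock(struct foo *f)
	{ au_rw_read_unlock(&f->rwsem); }
	static inline void foo_write_unlock(struct foo *f)
	{ au_rw_write_unlock(&f->rwsem); }
	static inline void foo_downgrade_lock(struct foo *f)
	{ au_rw_dgrade_lock(&f->rwsem); }
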
diff --git b/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c
new file mode 100644
index 0000000..a9d782e
--- /dev/null
+++ b/fs/aufs/sbinfo.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * superblock private data
+ */
+
+#include "aufs.h"
+
+/*
+ * they are necessary regardless of whether sysfs is disabled.
+ */
+void au_si_free(struct kobject *kobj)
+{
+	int i;
+	struct au_sbinfo *sbinfo;
+	char *locked __maybe_unused; /* debug only */
+
+	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
+	for (i = 0; i < AuPlink_NHASH; i++)
+		AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head));
+	AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len));
+
+	au_rw_write_lock(&sbinfo->si_rwsem);
+	au_br_free(sbinfo);
+	au_rw_write_unlock(&sbinfo->si_rwsem);
+
+	kfree(sbinfo->si_branch);
+	for (i = 0; i < AU_NPIDMAP; i++)
+		kfree(sbinfo->au_si_pid.pid_bitmap[i]);
+	mutex_destroy(&sbinfo->au_si_pid.pid_mtx);
+	mutex_destroy(&sbinfo->si_xib_mtx);
+	AuRwDestroy(&sbinfo->si_rwsem);
+
+	kfree(sbinfo);
+}
+
+int au_si_alloc(struct super_block *sb)
+{
+	int err, i;
+	struct au_sbinfo *sbinfo;
+	static struct lock_class_key aufs_si;
+
+	err = -ENOMEM;
+	sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS);
+	if (unlikely(!sbinfo))
+		goto out;
+
+	/* will be reallocated separately */
+	sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS);
+	if (unlikely(!sbinfo->si_branch))
+		goto out_sbinfo;
+
+	err = sysaufs_si_init(sbinfo);
+	if (unlikely(err))
+		goto out_br;
+
+	au_nwt_init(&sbinfo->si_nowait);
+	au_rw_init_wlock(&sbinfo->si_rwsem);
+	au_rw_class(&sbinfo->si_rwsem, &aufs_si);
+	mutex_init(&sbinfo->au_si_pid.pid_mtx);
+
+	atomic_long_set(&sbinfo->si_ninodes, 0);
+	atomic_long_set(&sbinfo->si_nfiles, 0);
+
+	sbinfo->si_bend = -1;
+	sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2;
+
+	sbinfo->si_wbr_copyup = AuWbrCopyup_Def;
+	sbinfo->si_wbr_create = AuWbrCreate_Def;
+	sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup;
+	sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create;
+
+	au_fhsm_init(sbinfo);
+
+	sbinfo->si_mntflags = au_opts_plink(AuOpt_Def);
+
+	sbinfo->si_xino_jiffy = jiffies;
+	sbinfo->si_xino_expire
+		= msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC);
+	mutex_init(&sbinfo->si_xib_mtx);
+	sbinfo->si_xino_brid = -1;
+	/* leave si_xib_last_pindex and si_xib_next_bit */
+
+	au_sphl_init(&sbinfo->si_aopen);
+
+	sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC);
+	sbinfo->si_rdblk = AUFS_RDBLK_DEF;
+	sbinfo->si_rdhash = AUFS_RDHASH_DEF;
+	sbinfo->si_dirwh = AUFS_DIRWH_DEF;
+
+	for (i = 0; i < AuPlink_NHASH; i++)
+		au_sphl_init(sbinfo->si_plink + i);
+	init_waitqueue_head(&sbinfo->si_plink_wq);
+	spin_lock_init(&sbinfo->si_plink_maint_lock);
+
+	au_sphl_init(&sbinfo->si_files);
+
+	/* with getattr by default */
+	sbinfo->si_iop_array = aufs_iop;
+
+	/* leave other members for sysaufs and si_mnt. */
+	sbinfo->si_sb = sb;
+	sb->s_fs_info = sbinfo;
+	si_pid_set(sb);
+	return 0; /* success */
+
+out_br:
+	kfree(sbinfo->si_branch);
+out_sbinfo:
+	kfree(sbinfo);
+out:
+	return err;
+}
+
+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr)
+{
+	int err, sz;
+	struct au_branch **brp;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	err = -ENOMEM;
+	sz = sizeof(*brp) * (sbinfo->si_bend + 1);
+	if (unlikely(!sz))
+		sz = sizeof(*brp);
+	brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS);
+	if (brp) {
+		sbinfo->si_branch = brp;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+unsigned int au_sigen_inc(struct super_block *sb)
+{
+	unsigned int gen;
+	struct inode *inode;
+
+	SiMustWriteLock(sb);
+
+	gen = ++au_sbi(sb)->si_generation;
+	au_update_digen(sb->s_root);
+	inode = d_inode(sb->s_root);
+	au_update_iigen(inode, /*half*/0);
+	inode->i_version++;
+	return gen;
+}
+
+aufs_bindex_t au_new_br_id(struct super_block *sb)
+{
+	aufs_bindex_t br_id;
+	int i;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	for (i = 0; i <= AUFS_BRANCH_MAX; i++) {
+		br_id = ++sbinfo->si_last_br_id;
+		AuDebugOn(br_id < 0);
+		if (br_id && au_br_index(sb, br_id) < 0)
+			return br_id;
+	}
+
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* it is ok that new 'nwt' tasks are appended while we are sleeping */
+int si_read_lock(struct super_block *sb, int flags)
+{
+	int err;
+
+	err = 0;
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+
+	si_noflush_read_lock(sb);
+	err = au_plink_maint(sb, flags);
+	if (unlikely(err))
+		si_read_unlock(sb);
+
+	return err;
+}
+
+int si_write_lock(struct super_block *sb, int flags)
+{
+	int err;
+
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+
+	si_noflush_write_lock(sb);
+	err = au_plink_maint(sb, flags);
+	if (unlikely(err))
+		si_write_unlock(sb);
+
+	return err;
+}
+
+/* dentry and super_block lock. call at entry point */
+int aufs_read_lock(struct dentry *dentry, int flags)
+{
+	int err;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, flags);
+	if (unlikely(err))
+		goto out;
+
+	if (au_ftest_lock(flags, DW))
+		di_write_lock_child(dentry);
+	else
+		di_read_lock_child(dentry, flags);
+
+	if (au_ftest_lock(flags, GEN)) {
+		err = au_digen_test(dentry, au_sigen(sb));
+		if (!au_opt_test(au_mntflags(sb), UDBA_NONE))
+			AuDebugOn(!err && au_dbrange_test(dentry));
+		else if (!err)
+			err = au_dbrange_test(dentry);
+		if (unlikely(err))
+			aufs_read_unlock(dentry, flags);
+	}
+
+out:
+	return err;
+}
+
+void aufs_read_unlock(struct dentry *dentry, int flags)
+{
+	if (au_ftest_lock(flags, DW))
+		di_write_unlock(dentry);
+	else
+		di_read_unlock(dentry, flags);
+	si_read_unlock(dentry->d_sb);
+}
+
+void aufs_write_lock(struct dentry *dentry)
+{
+	si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW);
+	di_write_lock_child(dentry);
+}
+
+void aufs_write_unlock(struct dentry *dentry)
+{
+	di_write_unlock(dentry);
+	si_write_unlock(dentry->d_sb);
+}
+
+int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags)
+{
+	int err;
+	unsigned int sigen;
+	struct super_block *sb;
+
+	sb = d1->d_sb;
+	err = si_read_lock(sb, flags);
+	if (unlikely(err))
+		goto out;
+
+	di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIRS));
+
+	if (au_ftest_lock(flags, GEN)) {
+		sigen = au_sigen(sb);
+		err = au_digen_test(d1, sigen);
+		AuDebugOn(!err && au_dbrange_test(d1));
+		if (!err) {
+			err = au_digen_test(d2, sigen);
+			AuDebugOn(!err && au_dbrange_test(d2));
+		}
+		if (unlikely(err))
+			aufs_read_and_write_unlock2(d1, d2);
+	}
+
+out:
+	return err;
+}
+
+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2)
+{
+	di_write_unlock2(d1, d2);
+	si_read_unlock(d1->d_sb);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void si_pid_alloc(struct au_si_pid *au_si_pid, int idx)
+{
+	unsigned long *p;
+
+	BUILD_BUG_ON(sizeof(unsigned long) !=
+		     sizeof(*au_si_pid->pid_bitmap));
+
+	mutex_lock(&au_si_pid->pid_mtx);
+	p = au_si_pid->pid_bitmap[idx];
+	while (!p) {
+		/*
+		 * bad approach.
+		 * but keeping 'si_pid_set()' void is more important.
+		 */
+		p = kcalloc(BITS_TO_LONGS(AU_PIDSTEP),
+			    sizeof(*au_si_pid->pid_bitmap),
+			    GFP_NOFS);
+		if (p)
+			break;
+		cond_resched();
+	}
+	au_si_pid->pid_bitmap[idx] = p;
+	mutex_unlock(&au_si_pid->pid_mtx);
+}
+
+void si_pid_set(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+	struct au_si_pid *au_si_pid;
+
+	si_pid_idx_bit(&idx, &bit);
+	au_si_pid = &au_sbi(sb)->au_si_pid;
+	bitmap = au_si_pid->pid_bitmap[idx];
+	if (!bitmap) {
+		si_pid_alloc(au_si_pid, idx);
+		bitmap = au_si_pid->pid_bitmap[idx];
+	}
+	AuDebugOn(test_bit(bit, bitmap));
+	set_bit(bit, bitmap);
+	/* smp_mb(); */
+}
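
si_read_lock(), aufs_read_lock() and their unlock counterparts above are the standard brackets around aufs entry points; the AuLock_* flags they take are defined in super.h later in this patch. A minimal sketch of the usual pattern (the operation itself is hypothetical):

	static int example_op(struct dentry *dentry)
	{
		int err;

		/* flush pending nowait tasks, write-lock the dentry and
		 * verify its generation against the superblock */
		err = aufs_read_lock(dentry,
				     AuLock_DW | AuLock_FLUSH | AuLock_GEN);
		if (unlikely(err))
			goto out;

		/* ... operate on the dentry and its branches here ... */

		aufs_read_unlock(dentry, AuLock_DW);
	out:
		return err;
	}

On the AuLock_GEN failure path aufs_read_lock() has already unlocked everything itself, so the caller only propagates the error.
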
diff --git b/fs/aufs/spl.h b/fs/aufs/spl.h
new file mode 100644
index 0000000..f9b5288
--- /dev/null
+++ b/fs/aufs/spl.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * simple list protected by a spinlock
+ */
+
+#ifndef __AUFS_SPL_H__
+#define __AUFS_SPL_H__
+
+#ifdef __KERNEL__
+
+struct au_splhead {
+	spinlock_t		spin;
+	struct list_head	head;
+};
+
+static inline void au_spl_init(struct au_splhead *spl)
+{
+	spin_lock_init(&spl->spin);
+	INIT_LIST_HEAD(&spl->head);
+}
+
+static inline void au_spl_add(struct list_head *list, struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_add(list, &spl->head);
+	spin_unlock(&spl->spin);
+}
+
+static inline void au_spl_del(struct list_head *list, struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_del(list);
+	spin_unlock(&spl->spin);
+}
+
+static inline void au_spl_del_rcu(struct list_head *list,
+				  struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_del_rcu(list);
+	spin_unlock(&spl->spin);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_sphlhead {
+	spinlock_t		spin;
+	struct hlist_head	head;
+};
+
+static inline void au_sphl_init(struct au_sphlhead *sphl)
+{
+	spin_lock_init(&sphl->spin);
+	INIT_HLIST_HEAD(&sphl->head);
+}
+
+static inline void au_sphl_add(struct hlist_node *hlist,
+			       struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_add_head(hlist, &sphl->head);
+	spin_unlock(&sphl->spin);
+}
+
+static inline void au_sphl_del(struct hlist_node *hlist,
+			       struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_del(hlist);
+	spin_unlock(&sphl->spin);
+}
+
+static inline void au_sphl_del_rcu(struct hlist_node *hlist,
+				   struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_del_rcu(hlist);
+	spin_unlock(&sphl->spin);
+}
+
+static inline unsigned long au_sphl_count(struct au_sphlhead *sphl)
+{
+	unsigned long cnt;
+	struct hlist_node *pos;
+
+	cnt = 0;
+	spin_lock(&sphl->spin);
+	hlist_for_each(pos, &sphl->head)
+		cnt++;
+	spin_unlock(&sphl->spin);
+	return cnt;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_SPL_H__ */
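
A brief usage sketch for the hlist flavour above; the container type is hypothetical, but the pattern matches how si_plink and si_files are used later in this patch (kernel-side sketch, assuming <linux/printk.h> and this header are included):

	struct item {
		struct hlist_node node;
		int value;
	};

	static void example(struct au_sphlhead *sphl, struct item *it)
	{
		au_sphl_add(&it->node, sphl);	/* insert under the spinlock */
		/* au_sphl_count() walks the list under the same spinlock */
		pr_info("%lu items\n", au_sphl_count(sphl));
		au_sphl_del(&it->node, sphl);	/* remove under the spinlock */
	}
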
diff --git b/fs/aufs/super.c b/fs/aufs/super.c
new file mode 100644
index 0000000..7928a50
--- /dev/null
+++ b/fs/aufs/super.c
@@ -0,0 +1,1026 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount and super_block operations
+ */
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/vmalloc.h>
+#include "aufs.h"
+
+/*
+ * super_operations
+ */
+static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused)
+{
+	struct au_icntnr *c;
+
+	c = au_cache_alloc_icntnr();
+	if (c) {
+		au_icntnr_init(c);
+		c->vfs_inode.i_version = 1; /* sigen(sb); */
+		c->iinfo.ii_hinode = NULL;
+		return &c->vfs_inode;
+	}
+	return NULL;
+}
+
+static void aufs_destroy_inode_cb(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	INIT_HLIST_HEAD(&inode->i_dentry);
+	au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode));
+}
+
+static void aufs_destroy_inode(struct inode *inode)
+{
+	au_iinfo_fin(inode);
+	call_rcu(&inode->i_rcu, aufs_destroy_inode_cb);
+}
+
+struct inode *au_iget_locked(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode)) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	if (!(inode->i_state & I_NEW))
+		goto out;
+
+	err = au_xigen_new(inode);
+	if (!err)
+		err = au_iinfo_init(inode);
+	if (!err)
+		inode->i_version++;
+	else {
+		iget_failed(inode);
+		inode = ERR_PTR(err);
+	}
+
+out:
+	/* never return NULL */
+	AuDebugOn(!inode);
+	AuTraceErrPtr(inode);
+	return inode;
+}
+
+/* lock free root dinfo */
+static int au_show_brs(struct seq_file *seq, struct super_block *sb)
+{
+	int err;
+	aufs_bindex_t bindex, bend;
+	struct path path;
+	struct au_hdentry *hdp;
+	struct au_branch *br;
+	au_br_perm_str_t perm;
+
+	err = 0;
+	bend = au_sbend(sb);
+	hdp = au_di(sb->s_root)->di_hdentry;
+	for (bindex = 0; !err && bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		path.mnt = au_br_mnt(br);
+		path.dentry = hdp[bindex].hd_dentry;
+		err = au_seq_path(seq, &path);
+		if (!err) {
+			au_optstr_br_perm(&perm, br->br_perm);
+			seq_printf(seq, "=%s", perm.a);
+			if (bindex != bend)
+				seq_putc(seq, ':');
+		}
+	}
+	if (unlikely(err || seq_has_overflowed(seq)))
+		err = -E2BIG;
+
+	return err;
+}
+
+static void au_show_wbr_create(struct seq_file *m, int v,
+			       struct au_sbinfo *sbinfo)
+{
+	const char *pat;
+
+	AuRwMustAnyLock(&sbinfo->si_rwsem);
+
+	seq_puts(m, ",create=");
+	pat = au_optstr_wbr_create(v);
+	switch (v) {
+	case AuWbrCreate_TDP:
+	case AuWbrCreate_RR:
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_PMFS:
+		seq_puts(m, pat);
+		break;
+	case AuWbrCreate_MFSV:
+		seq_printf(m, /*pat*/"mfs:%lu",
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_PMFSV:
+		seq_printf(m, /*pat*/"pmfs:%lu",
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_MFSRR:
+		seq_printf(m, /*pat*/"mfsrr:%llu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark);
+		break;
+	case AuWbrCreate_MFSRRV:
+		seq_printf(m, /*pat*/"mfsrr:%llu:%lu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark,
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_PMFSRR:
+		seq_printf(m, /*pat*/"pmfsrr:%llu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark);
+		break;
+	case AuWbrCreate_PMFSRRV:
+		seq_printf(m, /*pat*/"pmfsrr:%llu:%lu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark,
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	}
+}
+
+static int au_show_xino(struct seq_file *seq, struct super_block *sb)
+{
+#ifdef CONFIG_SYSFS
+	return 0;
+#else
+	int err;
+	const int len = sizeof(AUFS_XINO_FNAME) - 1;
+	aufs_bindex_t bindex, brid;
+	struct qstr *name;
+	struct file *f;
+	struct dentry *d, *h_root;
+	struct au_hdentry *hdp;
+
+	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem);
+
+	err = 0;
+	f = au_sbi(sb)->si_xib;
+	if (!f)
+		goto out;
+
+	/* stop printing the default xino path on the first writable branch */
+	h_root = NULL;
+	brid = au_xino_brid(sb);
+	if (brid >= 0) {
+		bindex = au_br_index(sb, brid);
+		hdp = au_di(sb->s_root)->di_hdentry;
+		h_root = hdp[0 + bindex].hd_dentry;
+	}
+	d = f->f_path.dentry;
+	name = &d->d_name;
+	/* safe ->d_parent because the file is unlinked */
+	if (d->d_parent == h_root
+	    && name->len == len
+	    && !memcmp(name->name, AUFS_XINO_FNAME, len))
+		goto out;
+
+	seq_puts(seq, ",xino=");
+	err = au_xino_path(seq, f);
+
+out:
+	return err;
+#endif
+}
+
+/* seq_file will call us again if the string was too long */
+static int aufs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	int err;
+	unsigned int mnt_flags, v;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+#define AuBool(name, str) do { \
+	v = au_opt_test(mnt_flags, name); \
+	if (v != au_opt_test(AuOpt_Def, name)) \
+		seq_printf(m, ",%s" #str, v ? "" : "no"); \
+} while (0)
+
+#define AuStr(name, str) do { \
+	v = mnt_flags & AuOptMask_##name; \
+	if (v != (AuOpt_Def & AuOptMask_##name)) \
+		seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \
+} while (0)
+
+#define AuUInt(name, str, val) do { \
+	if (val != AUFS_##name##_DEF) \
+		seq_printf(m, "," #str "=%u", val); \
+} while (0)
+
+	sb = dentry->d_sb;
+	if (sb->s_flags & MS_POSIXACL)
+		seq_puts(m, ",acl");
+
+	/* lock free root dinfo */
+	si_noflush_read_lock(sb);
+	sbinfo = au_sbi(sb);
+	seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo));
+
+	mnt_flags = au_mntflags(sb);
+	if (au_opt_test(mnt_flags, XINO)) {
+		err = au_show_xino(m, sb);
+		if (unlikely(err))
+			goto out;
+	} else
+		seq_puts(m, ",noxino");
+
+	AuBool(TRUNC_XINO, trunc_xino);
+	AuStr(UDBA, udba);
+	AuBool(SHWH, shwh);
+	AuBool(PLINK, plink);
+	AuBool(DIO, dio);
+	AuBool(DIRPERM1, dirperm1);
+
+	v = sbinfo->si_wbr_create;
+	if (v != AuWbrCreate_Def)
+		au_show_wbr_create(m, v, sbinfo);
+
+	v = sbinfo->si_wbr_copyup;
+	if (v != AuWbrCopyup_Def)
+		seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v));
+
+	v = au_opt_test(mnt_flags, ALWAYS_DIROPQ);
+	if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ))
+		seq_printf(m, ",diropq=%c", v ? 'a' : 'w');
+
+	AuUInt(DIRWH, dirwh, sbinfo->si_dirwh);
+
+	v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC;
+	AuUInt(RDCACHE, rdcache, v);
+
+	AuUInt(RDBLK, rdblk, sbinfo->si_rdblk);
+	AuUInt(RDHASH, rdhash, sbinfo->si_rdhash);
+
+	au_fhsm_show(m, sbinfo);
+
+	AuBool(SUM, sum);
+	/* AuBool(SUM_W, wsum); */
+	AuBool(WARN_PERM, warn_perm);
+	AuBool(VERBOSE, verbose);
+
+out:
+	/* be sure to print "br:" last */
+	if (!sysaufs_brs) {
+		seq_puts(m, ",br:");
+		au_show_brs(m, sb);
+	}
+	si_read_unlock(sb);
+	return 0;
+
+#undef AuBool
+#undef AuStr
+#undef AuUInt
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* sum mode which returns the summation for statfs(2) */
+
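+/*
+ * the two helpers below saturate instead of wrapping around: a result
+ * smaller than the original value is treated as an overflow and
+ * ULLONG_MAX is returned.
+ */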
+static u64 au_add_till_max(u64 a, u64 b)
+{
+	u64 old;
+
+	old = a;
+	a += b;
+	if (old <= a)
+		return a;
+	return ULLONG_MAX;
+}
+
+static u64 au_mul_till_max(u64 a, long mul)
+{
+	u64 old;
+
+	old = a;
+	a *= mul;
+	if (old <= a)
+		return a;
+	return ULLONG_MAX;
+}
+
+static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf)
+{
+	int err;
+	long bsize, factor;
+	u64 blocks, bfree, bavail, files, ffree;
+	aufs_bindex_t bend, bindex, i;
+	unsigned char shared;
+	struct path h_path;
+	struct super_block *h_sb;
+
+	err = 0;
+	bsize = LONG_MAX;
+	files = 0;
+	ffree = 0;
+	blocks = 0;
+	bfree = 0;
+	bavail = 0;
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		h_sb = h_path.mnt->mnt_sb;
+		shared = 0;
+		for (i = 0; !shared && i < bindex; i++)
+			shared = (au_sbr_sb(sb, i) == h_sb);
+		if (shared)
+			continue;
+
+		/* sb->s_root for NFS is unreliable */
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, buf);
+		if (unlikely(err))
+			goto out;
+
+		if (bsize > buf->f_bsize) {
+			/*
+			 * we will reduce bsize, so we have to expand blocks
+			 * etc. to match them again
+			 */
+			factor = (bsize / buf->f_bsize);
+			blocks = au_mul_till_max(blocks, factor);
+			bfree = au_mul_till_max(bfree, factor);
+			bavail = au_mul_till_max(bavail, factor);
+			bsize = buf->f_bsize;
+		}
+
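+		/* scale this branch's counts from its f_bsize to the common bsize */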
+		factor = (buf->f_bsize / bsize);
+		blocks = au_add_till_max(blocks,
+				au_mul_till_max(buf->f_blocks, factor));
+		bfree = au_add_till_max(bfree,
+				au_mul_till_max(buf->f_bfree, factor));
+		bavail = au_add_till_max(bavail,
+				au_mul_till_max(buf->f_bavail, factor));
+		files = au_add_till_max(files, buf->f_files);
+		ffree = au_add_till_max(ffree, buf->f_ffree);
+	}
+
+	buf->f_bsize = bsize;
+	buf->f_blocks = blocks;
+	buf->f_bfree = bfree;
+	buf->f_bavail = bavail;
+	buf->f_files = files;
+	buf->f_ffree = ffree;
+	buf->f_frsize = 0;
+
+out:
+	return err;
+}
+
+static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int err;
+	struct path h_path;
+	struct super_block *sb;
+
+	/* lock free root dinfo */
+	sb = dentry->d_sb;
+	si_noflush_read_lock(sb);
+	if (!au_opt_test(au_mntflags(sb), SUM)) {
+		/* sb->s_root for NFS is unreliable */
+		h_path.mnt = au_sbr_mnt(sb, 0);
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, buf);
+	} else
+		err = au_statfs_sum(sb, buf);
+	si_read_unlock(sb);
+
+	if (!err) {
+		buf->f_type = AUFS_SUPER_MAGIC;
+		buf->f_namelen = AUFS_MAX_NAMELEN;
+		memset(&buf->f_fsid, 0, sizeof(buf->f_fsid));
+	}
+	/* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_sync_fs(struct super_block *sb, int wait)
+{
+	int err, e;
+	aufs_bindex_t bend, bindex;
+	struct au_branch *br;
+	struct super_block *h_sb;
+
+	err = 0;
+	si_noflush_read_lock(sb);
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!au_br_writable(br->br_perm))
+			continue;
+
+		h_sb = au_sbr_sb(sb, bindex);
+		if (h_sb->s_op->sync_fs) {
+			e = h_sb->s_op->sync_fs(h_sb, wait);
+			if (unlikely(e && !err))
+				err = e;
+			/* go on even if an error happens */
+		}
+	}
+	si_read_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* final actions when unmounting a file system */
+static void aufs_put_super(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = au_sbi(sb);
+	if (!sbinfo)
+		return;
+
+	dbgaufs_si_fin(sbinfo);
+	kobject_put(&sbinfo->si_kobj);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
+		     struct super_block *sb, void *arg)
+{
+	void *array;
+	unsigned long long n, sz;
+
+	array = NULL;
+	n = 0;
+	if (!*hint)
+		goto out;
+
+	if (*hint > ULLONG_MAX / sizeof(array)) {
+		array = ERR_PTR(-EMFILE);
+		pr_err("hint %llu\n", *hint);
+		goto out;
+	}
+
+	sz = sizeof(array) * *hint;
+	array = kzalloc(sz, GFP_NOFS);
+	if (unlikely(!array))
+		array = vzalloc(sz);
+	if (unlikely(!array)) {
+		array = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	n = cb(sb, array, *hint, arg);
+	AuDebugOn(n > *hint);
+
+out:
+	*hint = n;
+	return array;
+}
+
+static unsigned long long au_iarray_cb(struct super_block *sb, void *a,
+				       unsigned long long max __maybe_unused,
+				       void *arg)
+{
+	unsigned long long n;
+	struct inode **p, *inode;
+	struct list_head *head;
+
+	n = 0;
+	p = a;
+	head = arg;
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, head, i_sb_list) {
+		if (!is_bad_inode(inode)
+		    && au_ii(inode)->ii_bstart >= 0) {
+			spin_lock(&inode->i_lock);
+			if (atomic_read(&inode->i_count)) {
+				au_igrab(inode);
+				*p++ = inode;
+				n++;
+				AuDebugOn(n > max);
+			}
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	return n;
+}
+
+struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max)
+{
+	*max = atomic_long_read(&au_sbi(sb)->si_ninodes);
+	return au_array_alloc(max, au_iarray_cb, sb, &sb->s_inodes);
+}
+
+void au_iarray_free(struct inode **a, unsigned long long max)
+{
+	unsigned long long ull;
+
+	for (ull = 0; ull < max; ull++)
+		iput(a[ull]);
+	kvfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * refresh dentry and inode at remount time.
+ */
+/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */
+static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags,
+		      struct dentry *parent)
+{
+	int err;
+
+	di_write_lock_child(dentry);
+	di_read_lock_parent(parent, AuLock_IR);
+	err = au_refresh_dentry(dentry, parent);
+	if (!err && dir_flags)
+		au_hn_reset(d_inode(dentry), dir_flags);
+	di_read_unlock(parent, AuLock_IR);
+	di_write_unlock(dentry);
+
+	return err;
+}
+
+static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen,
+			   struct au_sbinfo *sbinfo,
+			   const unsigned int dir_flags, unsigned int do_idop)
+{
+	int err;
+	struct dentry *parent;
+
+	err = 0;
+	parent = dget_parent(dentry);
+	if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) {
+		if (d_really_is_positive(dentry)) {
+			if (!d_is_dir(dentry))
+				err = au_do_refresh(dentry, /*dir_flags*/0,
+						 parent);
+			else {
+				err = au_do_refresh(dentry, dir_flags, parent);
+				if (unlikely(err))
+					au_fset_si(sbinfo, FAILED_REFRESH_DIR);
+			}
+		} else
+			err = au_do_refresh(dentry, /*dir_flags*/0, parent);
+		AuDbgDentry(dentry);
+	}
+	dput(parent);
+
+	if (!err) {
+		if (do_idop)
+			au_refresh_dop(dentry, /*force_reval*/0);
+	} else
+		au_refresh_dop(dentry, /*force_reval*/1);
+
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_refresh_d(struct super_block *sb, unsigned int do_idop)
+{
+	int err, i, j, ndentry, e;
+	unsigned int sigen;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries, *d;
+	struct au_sbinfo *sbinfo;
+	struct dentry *root = sb->s_root;
+	const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1);
+
+	if (do_idop)
+		au_refresh_dop(root, /*force_reval*/0);
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, root, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	sigen = au_sigen(sb);
+	sbinfo = au_sbi(sb);
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			d = dentries[j];
+			e = au_do_refresh_d(d, sigen, sbinfo, dir_flags,
+					    do_idop);
+			if (unlikely(e && !err))
+				err = e;
+			/* go on even if err */
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int au_refresh_i(struct super_block *sb, unsigned int do_idop)
+{
+	int err, e;
+	unsigned int sigen;
+	unsigned long long max, ull;
+	struct inode *inode, **array;
+
+	array = au_iarray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	sigen = au_sigen(sb);
+	for (ull = 0; ull < max; ull++) {
+		inode = array[ull];
+		if (unlikely(!inode))
+			break;
+
+		e = 0;
+		ii_write_lock_child(inode);
+		if (au_iigen(inode, NULL) != sigen) {
+			e = au_refresh_hinode_self(inode);
+			if (unlikely(e)) {
+				au_refresh_iop(inode, /*force_getattr*/1);
+				pr_err("error %d, i%lu\n", e, inode->i_ino);
+				if (!err)
+					err = e;
+				/* go on even if err */
+			}
+		}
+		if (!e && do_idop)
+			au_refresh_iop(inode, /*force_getattr*/0);
+		ii_write_unlock(inode);
+	}
+
+	au_iarray_free(array, max);
+
+out:
+	return err;
+}
+
+static void au_remount_refresh(struct super_block *sb, unsigned int do_idop)
+{
+	int err, e;
+	unsigned int udba;
+	aufs_bindex_t bindex, bend;
+	struct dentry *root;
+	struct inode *inode;
+	struct au_branch *br;
+	struct au_sbinfo *sbi;
+
+	au_sigen_inc(sb);
+	sbi = au_sbi(sb);
+	au_fclr_si(sbi, FAILED_REFRESH_DIR);
+
+	root = sb->s_root;
+	DiMustNoWaiters(root);
+	inode = d_inode(root);
+	IiMustNoWaiters(inode);
+
+	udba = au_opt_udba(sb);
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		err = au_hnotify_reset_br(udba, br, br->br_perm);
+		if (unlikely(err))
+			AuIOErr("hnotify failed on br %d, %d, ignored\n",
+				bindex, err);
+		/* go on even if err */
+	}
+	au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1));
+
+	if (do_idop) {
+		if (au_ftest_si(sbi, NO_DREVAL)) {
+			AuDebugOn(sb->s_d_op == &aufs_dop_noreval);
+			sb->s_d_op = &aufs_dop_noreval;
+			AuDebugOn(sbi->si_iop_array == aufs_iop_nogetattr);
+			sbi->si_iop_array = aufs_iop_nogetattr;
+		} else {
+			AuDebugOn(sb->s_d_op == &aufs_dop);
+			sb->s_d_op = &aufs_dop;
+			AuDebugOn(sbi->si_iop_array == aufs_iop);
+			sbi->si_iop_array = aufs_iop;
+		}
+		pr_info("reset to %pf and %pf\n",
+			sb->s_d_op, sbi->si_iop_array);
+	}
+
+	di_write_unlock(root);
+	err = au_refresh_d(sb, do_idop);
+	e = au_refresh_i(sb, do_idop);
+	if (unlikely(e && !err))
+		err = e;
+	/* aufs_write_lock() calls ..._child() */
+	di_write_lock_child(root);
+
+	au_cpup_attr_all(inode, /*force*/1);
+
+	if (unlikely(err))
+		AuIOErr("refresh failed, ignored, %d\n", err);
+}
+
+/*
+ * stop mount(8) from interpreting some errno values specially and from
+ * printing strange error messages
+ */
+static int cvt_err(int err)
+{
+	AuTraceErr(err);
+
+	switch (err) {
+	case -ENOENT:
+	case -ENOTDIR:
+	case -EEXIST:
+	case -EIO:
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static int aufs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	int err, do_dx;
+	unsigned int mntflags;
+	struct au_opts opts = {
+		.opt = NULL
+	};
+	struct dentry *root;
+	struct inode *inode;
+	struct au_sbinfo *sbinfo;
+
+	err = 0;
+	root = sb->s_root;
+	if (!data || !*data) {
+		err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+		if (!err) {
+			di_write_lock_child(root);
+			err = au_opts_verify(sb, *flags, /*pending*/0);
+			aufs_write_unlock(root);
+		}
+		goto out;
+	}
+
+	err = -ENOMEM;
+	opts.opt = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!opts.opt))
+		goto out;
+	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
+	opts.flags = AuOpts_REMOUNT;
+	opts.sb_flags = *flags;
+
+	/* parse it before aufs lock */
+	err = au_opts_parse(sb, data, &opts);
+	if (unlikely(err))
+		goto out_opts;
+
+	sbinfo = au_sbi(sb);
+	inode = d_inode(root);
+	inode_lock(inode);
+	err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_mtx;
+	di_write_lock_child(root);
+
+	/* au_opts_remount() may return an error */
+	err = au_opts_remount(sb, &opts);
+	au_opts_free(&opts);
+
+	if (au_ftest_opts(opts.flags, REFRESH))
+		au_remount_refresh(sb, au_ftest_opts(opts.flags, REFRESH_IDOP));
+
+	if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) {
+		mntflags = au_mntflags(sb);
+		do_dx = !!au_opt_test(mntflags, DIO);
+		au_dy_arefresh(do_dx);
+	}
+
+	au_fhsm_wrote_all(sb, /*force*/1); /* ?? */
+	aufs_write_unlock(root);
+
+out_mtx:
+	inode_unlock(inode);
+out_opts:
+	free_page((unsigned long)opts.opt);
+out:
+	err = cvt_err(err);
+	AuTraceErr(err);
+	return err;
+}
+
+static const struct super_operations aufs_sop = {
+	.alloc_inode	= aufs_alloc_inode,
+	.destroy_inode	= aufs_destroy_inode,
+	/* always deleting, no clearing */
+	.drop_inode	= generic_delete_inode,
+	.show_options	= aufs_show_options,
+	.statfs		= aufs_statfs,
+	.put_super	= aufs_put_super,
+	.sync_fs	= aufs_sync_fs,
+	.remount_fs	= aufs_remount_fs
+};
+
+/* ---------------------------------------------------------------------- */
+
+static int alloc_root(struct super_block *sb)
+{
+	int err;
+	struct inode *inode;
+	struct dentry *root;
+
+	err = -ENOMEM;
+	inode = au_iget_locked(sb, AUFS_ROOT_INO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	inode->i_op = aufs_iop + AuIop_DIR; /* with getattr by default */
+	inode->i_fop = &aufs_dir_fop;
+	inode->i_mode = S_IFDIR;
+	set_nlink(inode, 2);
+	unlock_new_inode(inode);
+
+	err = -ENOMEM;
+	root = d_make_root(inode);
+	if (unlikely(!root))
+		goto out;
+
+	err = au_di_init(root);
+	if (!err) {
+		sb->s_root = root;
+		return 0; /* success */
+	}
+	dput(root);
+
+out:
+	return err;
+}
+
+static int aufs_fill_super(struct super_block *sb, void *raw_data,
+			   int silent __maybe_unused)
+{
+	int err;
+	struct au_opts opts = {
+		.opt = NULL
+	};
+	struct au_sbinfo *sbinfo;
+	struct dentry *root;
+	struct inode *inode;
+	char *arg = raw_data;
+
+	if (unlikely(!arg || !*arg)) {
+		err = -EINVAL;
+		pr_err("no arg\n");
+		goto out;
+	}
+
+	err = -ENOMEM;
+	opts.opt = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!opts.opt))
+		goto out;
+	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
+	opts.sb_flags = sb->s_flags;
+
+	err = au_si_alloc(sb);
+	if (unlikely(err))
+		goto out_opts;
+	sbinfo = au_sbi(sb);
+
+	/* all timestamps always follow the ones on the branch */
+	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+	sb->s_op = &aufs_sop;
+	sb->s_d_op = &aufs_dop;
+	sb->s_magic = AUFS_SUPER_MAGIC;
+	sb->s_maxbytes = 0;
+	sb->s_stack_depth = 1;
+	au_export_init(sb);
+	/* au_xattr_init(sb); */
+
+	err = alloc_root(sb);
+	if (unlikely(err)) {
+		si_write_unlock(sb);
+		goto out_info;
+	}
+	root = sb->s_root;
+	inode = d_inode(root);
+
+	/*
+	 * actually we could parse the options here regardless of the aufs
+	 * lock. but at remount time, parsing must be done before taking the
+	 * aufs lock, so we follow the same rule here.
+	 */
+	ii_write_lock_parent(inode);
+	aufs_write_unlock(root);
+	err = au_opts_parse(sb, arg, &opts);
+	if (unlikely(err))
+		goto out_root;
+
+	/* lock vfs_inode first, then aufs. */
+	inode_lock(inode);
+	aufs_write_lock(root);
+	err = au_opts_mount(sb, &opts);
+	au_opts_free(&opts);
+	if (!err && au_ftest_si(sbinfo, NO_DREVAL)) {
+		sb->s_d_op = &aufs_dop_noreval;
+		pr_info("%pf\n", sb->s_d_op);
+		au_refresh_dop(root, /*force_reval*/0);
+		sbinfo->si_iop_array = aufs_iop_nogetattr;
+		au_refresh_iop(inode, /*force_getattr*/0);
+	}
+	aufs_write_unlock(root);
+	inode_unlock(inode);
+	if (!err)
+		goto out_opts; /* success */
+
+out_root:
+	dput(root);
+	sb->s_root = NULL;
+out_info:
+	dbgaufs_si_fin(sbinfo);
+	kobject_put(&sbinfo->si_kobj);
+	sb->s_fs_info = NULL;
+out_opts:
+	free_page((unsigned long)opts.opt);
+out:
+	AuTraceErr(err);
+	err = cvt_err(err);
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags,
+				 const char *dev_name __maybe_unused,
+				 void *raw_data)
+{
+	struct dentry *root;
+	struct super_block *sb;
+
+	/* all timestamps always follow the ones on the branch */
+	/* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */
+	root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super);
+	if (IS_ERR(root))
+		goto out;
+
+	sb = root->d_sb;
+	si_write_lock(sb, !AuLock_FLUSH);
+	sysaufs_brs_add(sb, 0);
+	si_write_unlock(sb);
+	au_sbilist_add(sb);
+
+out:
+	return root;
+}
+
+static void aufs_kill_sb(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = au_sbi(sb);
+	if (sbinfo) {
+		au_sbilist_del(sb);
+		aufs_write_lock(sb->s_root);
+		au_fhsm_fin(sb);
+		if (sbinfo->si_wbr_create_ops->fin)
+			sbinfo->si_wbr_create_ops->fin(sb);
+		if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) {
+			au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE);
+			au_remount_refresh(sb, /*do_idop*/0);
+		}
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_put(sb, /*verbose*/1);
+		au_xino_clr(sb);
+		sbinfo->si_sb = NULL;
+		aufs_write_unlock(sb->s_root);
+		au_nwt_flush(&sbinfo->si_nowait);
+	}
+	kill_anon_super(sb);
+}
+
+struct file_system_type aufs_fs_type = {
+	.name		= AUFS_FSTYPE,
+	/* a race between rename and others */
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+	.mount		= aufs_mount,
+	.kill_sb	= aufs_kill_sb,
+	/* no need to __module_get() and module_put(). */
+	.owner		= THIS_MODULE,
+};
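
aufs_fill_super() above refuses an empty option string; at minimum the branches must be given. A hedged sketch of mounting from C, assuming the "br:" option syntax that aufs_show_options() prints back (see the aufs manual for the authoritative grammar); the paths are examples only:

	#include <sys/mount.h>

	/* stack a writable branch over a read-only one */
	static int mount_aufs_example(void)
	{
		return mount("none", "/mnt/union", "aufs", 0,
			     "br:/upper=rw:/lower=ro");
	}
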
diff --git b/fs/aufs/super.h b/fs/aufs/super.h
new file mode 100644
index 0000000..23d03a0
--- /dev/null
+++ b/fs/aufs/super.h
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * super_block operations
+ */
+
+#ifndef __AUFS_SUPER_H__
+#define __AUFS_SUPER_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include "rwsem.h"
+#include "spl.h"
+#include "wkq.h"
+
+/* policies to select one among multiple writable branches */
+struct au_wbr_copyup_operations {
+	int (*copyup)(struct dentry *dentry);
+};
+
+#define AuWbr_DIR	1		/* target is a dir */
+#define AuWbr_PARENT	(1 << 1)	/* always require a parent */
+
+#define au_ftest_wbr(flags, name)	((flags) & AuWbr_##name)
+#define au_fset_wbr(flags, name)	{ (flags) |= AuWbr_##name; }
+#define au_fclr_wbr(flags, name)	{ (flags) &= ~AuWbr_##name; }
+
+struct au_wbr_create_operations {
+	int (*create)(struct dentry *dentry, unsigned int flags);
+	int (*init)(struct super_block *sb);
+	int (*fin)(struct super_block *sb);
+};
+
+struct au_wbr_mfs {
+	struct mutex	mfs_lock; /* protect this structure */
+	unsigned long	mfs_jiffy;
+	unsigned long	mfs_expire;
+	aufs_bindex_t	mfs_bindex;
+
+	unsigned long long	mfsrr_bytes;
+	unsigned long long	mfsrr_watermark;
+};
+
+#define AuPlink_NHASH 100
+static inline int au_plink_hash(ino_t ino)
+{
+	return ino % AuPlink_NHASH;
+}
+
+/* File-based Hierarchical Storage Management */
+struct au_fhsm {
+#ifdef CONFIG_AUFS_FHSM
+	/* allow only one process who can receive the notification */
+	spinlock_t		fhsm_spin;
+	pid_t			fhsm_pid;
+	wait_queue_head_t	fhsm_wqh;
+	atomic_t		fhsm_readable;
+
+	/* these are protected by si_rwsem */
+	unsigned long		fhsm_expire;
+	aufs_bindex_t		fhsm_bottom;
+#endif
+};
+
+#define AU_PIDSTEP	(int)(BITS_TO_LONGS(PID_MAX_DEFAULT) * BITS_PER_LONG)
+#define AU_NPIDMAP	(int)DIV_ROUND_UP(PID_MAX_LIMIT, AU_PIDSTEP)
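+/*
+ * e.g. on a typical 64-bit configuration (PID_MAX_DEFAULT 32768,
+ * PID_MAX_LIMIT 4194304, BITS_PER_LONG 64), each pid_bitmap[] slot covers
+ * 32768 pids and AU_NPIDMAP is 128; the slots are allocated lazily by
+ * si_pid_alloc().
+ */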
+struct au_si_pid {
+	unsigned long	*pid_bitmap[AU_NPIDMAP];
+	struct mutex	pid_mtx;
+};
+
+struct au_branch;
+struct au_sbinfo {
+	/* nowait tasks in the system-wide workqueue */
+	struct au_nowait_tasks	si_nowait;
+
+	/*
+	 * tried sb->s_umount, but failed due to the lock dependency with i_mutex.
+	 * rwsem for au_sbinfo is necessary.
+	 */
+	struct au_rwsem		si_rwsem;
+
+	/* prevent recursive locking in deleting inode */
+	struct au_si_pid	au_si_pid;
+
+	/*
+	 * dirty approach to protect sb->s_inodes and ->s_files (gone) from
+	 * remount.
+	 */
+	atomic_long_t		si_ninodes, si_nfiles;
+
+	/* branch management */
+	unsigned int		si_generation;
+
+	/* see AuSi_ flags */
+	unsigned char		au_si_status;
+
+	aufs_bindex_t		si_bend;
+
+	/* dirty trick to keep br_id non-negative */
+	unsigned int		si_last_br_id :
+				sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1;
+	struct au_branch	**si_branch;
+
+	/* policy to select a writable branch */
+	unsigned char		si_wbr_copyup;
+	unsigned char		si_wbr_create;
+	struct au_wbr_copyup_operations *si_wbr_copyup_ops;
+	struct au_wbr_create_operations *si_wbr_create_ops;
+
+	/* round robin */
+	atomic_t		si_wbr_rr_next;
+
+	/* most free space */
+	struct au_wbr_mfs	si_wbr_mfs;
+
+	/* File-based Hierarchical Storage Management */
+	struct au_fhsm		si_fhsm;
+
+	/* mount flags */
+	/* include/asm-ia64/siginfo.h defines a macro named si_flags */
+	unsigned int		si_mntflags;
+
+	/* external inode number (bitmap and translation table) */
+	vfs_readf_t		si_xread;
+	vfs_writef_t		si_xwrite;
+	struct file		*si_xib;
+	struct mutex		si_xib_mtx; /* protect xib members */
+	unsigned long		*si_xib_buf;
+	unsigned long		si_xib_last_pindex;
+	int			si_xib_next_bit;
+	aufs_bindex_t		si_xino_brid;
+	unsigned long		si_xino_jiffy;
+	unsigned long		si_xino_expire;
+	/* reserved for future use */
+	/* unsigned long long	si_xib_limit; */	/* Max xib file size */
+
+#ifdef CONFIG_AUFS_EXPORT
+	/* i_generation */
+	struct file		*si_xigen;
+	atomic_t		si_xigen_next;
+#endif
+
+	/* dirty trick to support atomic_open */
+	struct au_sphlhead	si_aopen;
+
+	/* vdir parameters */
+	unsigned long		si_rdcache;	/* max cache time in jiffies */
+	unsigned int		si_rdblk;	/* deblk size */
+	unsigned int		si_rdhash;	/* hash size */
+
+	/*
+	 * If the number of whiteouts is larger than si_dirwh, leave all of
+	 * them after au_whtmp_ren to reduce the cost of rmdir(2).
+	 * A future fsck.aufs or a kernel thread will remove them later.
+	 * Otherwise, remove all whiteouts and the dir in rmdir(2).
+	 */
+	unsigned int		si_dirwh;
+
+	/* pseudo_link list */
+	struct au_sphlhead	si_plink[AuPlink_NHASH];
+	wait_queue_head_t	si_plink_wq;
+	spinlock_t		si_plink_maint_lock;
+	pid_t			si_plink_maint_pid;
+
+	/* file list */
+	struct au_sphlhead	si_files;
+
+	/* with/without getattr, brother of sb->s_d_op */
+	struct inode_operations *si_iop_array;
+
+	/*
+	 * sysfs and lifetime management.
+	 * this is not a small structure, and it may waste memory when sysfs
+	 * is disabled, particularly when many aufs mounts exist. but most
+	 * configurations enable sysfs.
+	 */
+	struct kobject		si_kobj;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		 *si_dbgaufs;
+	struct dentry		 *si_dbgaufs_plink;
+	struct dentry		 *si_dbgaufs_xib;
+#ifdef CONFIG_AUFS_EXPORT
+	struct dentry		 *si_dbgaufs_xigen;
+#endif
+#endif
+
+#ifdef CONFIG_AUFS_SBILIST
+	struct hlist_node	si_list;
+#endif
+
+	/* dirty, necessary for unmounting, sysfs and sysrq */
+	struct super_block	*si_sb;
+};
+
+/* sbinfo status flags */
+/*
+ * set true when refresh_dirs() failed at remount time.
+ * then try refreshing dirs at access time again.
+ * if it is false, refreshing dirs at access time is unnecessary.
+ */
+#define AuSi_FAILED_REFRESH_DIR	1
+#define AuSi_FHSM		(1 << 1)	/* fhsm is active now */
+#define AuSi_NO_DREVAL		(1 << 2)	/* disable all d_revalidate */
+
+#ifndef CONFIG_AUFS_FHSM
+#undef AuSi_FHSM
+#define AuSi_FHSM		0
+#endif
+
+static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi,
+					   unsigned int flag)
+{
+	AuRwMustAnyLock(&sbi->si_rwsem);
+	return sbi->au_si_status & flag;
+}
+#define au_ftest_si(sbinfo, name)	au_do_ftest_si(sbinfo, AuSi_##name)
+#define au_fset_si(sbinfo, name) do { \
+	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
+	(sbinfo)->au_si_status |= AuSi_##name; \
+} while (0)
+#define au_fclr_si(sbinfo, name) do { \
+	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
+	(sbinfo)->au_si_status &= ~AuSi_##name; \
+} while (0)
+
+/* ---------------------------------------------------------------------- */
+
+/* policy to select one among writable branches */
+#define AuWbrCopyup(sbinfo, ...) \
+	((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__))
+#define AuWbrCreate(sbinfo, ...) \
+	((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__))
+
+/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */
+#define AuLock_DW		1		/* write-lock dentry */
+#define AuLock_IR		(1 << 1)	/* read-lock inode */
+#define AuLock_IW		(1 << 2)	/* write-lock inode */
+#define AuLock_FLUSH		(1 << 3)	/* wait for 'nowait' tasks */
+#define AuLock_DIRS		(1 << 4)	/* target is a pair of dirs */
+#define AuLock_NOPLM		(1 << 5)	/* return err in plm mode */
+#define AuLock_NOPLMW		(1 << 6)	/* wait for plm mode ends */
+#define AuLock_GEN		(1 << 7)	/* test digen/iigen */
+#define au_ftest_lock(flags, name)	((flags) & AuLock_##name)
+#define au_fset_lock(flags, name) \
+	do { (flags) |= AuLock_##name; } while (0)
+#define au_fclr_lock(flags, name) \
+	do { (flags) &= ~AuLock_##name; } while (0)
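+/*
+ * callers combine these flags; e.g. a (hypothetical) caller may pass
+ * AuLock_IR | AuLock_GEN to aufs_read_lock() to read-lock the inode as well
+ * and to verify the dentry/inode generations.
+ */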
+
+/* ---------------------------------------------------------------------- */
+
+/* super.c */
+extern struct file_system_type aufs_fs_type;
+struct inode *au_iget_locked(struct super_block *sb, ino_t ino);
+typedef unsigned long long (*au_arraycb_t)(struct super_block *sb, void *array,
+					   unsigned long long max, void *arg);
+void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
+		     struct super_block *sb, void *arg);
+struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max);
+void au_iarray_free(struct inode **a, unsigned long long max);
+
+/* sbinfo.c */
+void au_si_free(struct kobject *kobj);
+int au_si_alloc(struct super_block *sb);
+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr);
+
+unsigned int au_sigen_inc(struct super_block *sb);
+aufs_bindex_t au_new_br_id(struct super_block *sb);
+
+int si_read_lock(struct super_block *sb, int flags);
+int si_write_lock(struct super_block *sb, int flags);
+int aufs_read_lock(struct dentry *dentry, int flags);
+void aufs_read_unlock(struct dentry *dentry, int flags);
+void aufs_write_lock(struct dentry *dentry);
+void aufs_write_unlock(struct dentry *dentry);
+int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags);
+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2);
+
+/* wbr_policy.c */
+extern struct au_wbr_copyup_operations au_wbr_copyup_ops[];
+extern struct au_wbr_create_operations au_wbr_create_ops[];
+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex);
+int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart);
+
+/* mvdown.c */
+int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg);
+
+#ifdef CONFIG_AUFS_FHSM
+/* fhsm.c */
+
+static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm)
+{
+	pid_t pid;
+
+	spin_lock(&fhsm->fhsm_spin);
+	pid = fhsm->fhsm_pid;
+	spin_unlock(&fhsm->fhsm_spin);
+
+	return pid;
+}
+
+void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force);
+void au_fhsm_wrote_all(struct super_block *sb, int force);
+int au_fhsm_fd(struct super_block *sb, int oflags);
+int au_fhsm_br_alloc(struct au_branch *br);
+void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex);
+void au_fhsm_fin(struct super_block *sb);
+void au_fhsm_init(struct au_sbinfo *sbinfo);
+void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec);
+void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo);
+#else
+AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex,
+	   int force)
+AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force)
+AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags)
+AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm)
+AuStubInt0(au_fhsm_br_alloc, struct au_branch *br)
+AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(au_fhsm_fin, struct super_block *sb)
+AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo)
+AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec)
+AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo)
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_sbinfo *au_sbi(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_EXPORT
+int au_test_nfsd(void);
+void au_export_init(struct super_block *sb);
+void au_xigen_inc(struct inode *inode);
+int au_xigen_new(struct inode *inode);
+int au_xigen_set(struct super_block *sb, struct file *base);
+void au_xigen_clr(struct super_block *sb);
+
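+/* nfsd can retry on ESTALE, so report ESTALE to it and EBUSY to other callers */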
+static inline int au_busy_or_stale(void)
+{
+	if (!au_test_nfsd())
+		return -EBUSY;
+	return -ESTALE;
+}
+#else
+AuStubInt0(au_test_nfsd, void)
+AuStubVoid(au_export_init, struct super_block *sb)
+AuStubVoid(au_xigen_inc, struct inode *inode)
+AuStubInt0(au_xigen_new, struct inode *inode)
+AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base)
+AuStubVoid(au_xigen_clr, struct super_block *sb)
+AuStub(int, au_busy_or_stale, return -EBUSY, void)
+#endif /* CONFIG_AUFS_EXPORT */
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_SBILIST
+/* module.c */
+extern struct au_sphlhead au_sbilist;
+
+static inline void au_sbilist_init(void)
+{
+	au_sphl_init(&au_sbilist);
+}
+
+static inline void au_sbilist_add(struct super_block *sb)
+{
+	au_sphl_add(&au_sbi(sb)->si_list, &au_sbilist);
+}
+
+static inline void au_sbilist_del(struct super_block *sb)
+{
+	au_sphl_del(&au_sbi(sb)->si_list, &au_sbilist);
+}
+
+#ifdef CONFIG_AUFS_MAGIC_SYSRQ
+static inline void au_sbilist_lock(void)
+{
+	spin_lock(&au_sbilist.spin);
+}
+
+static inline void au_sbilist_unlock(void)
+{
+	spin_unlock(&au_sbilist.spin);
+}
+#define AuGFP_SBILIST	GFP_ATOMIC
+#else
+AuStubVoid(au_sbilist_lock, void)
+AuStubVoid(au_sbilist_unlock, void)
+#define AuGFP_SBILIST	GFP_NOFS
+#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
+#else
+AuStubVoid(au_sbilist_init, void)
+AuStubVoid(au_sbilist_add, struct super_block *sb)
+AuStubVoid(au_sbilist_del, struct super_block *sb)
+AuStubVoid(au_sbilist_lock, void)
+AuStubVoid(au_sbilist_unlock, void)
+#define AuGFP_SBILIST	GFP_NOFS
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo)
+{
+	/*
+	 * This function is effectively a dynamic '__init' function,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+#ifdef CONFIG_DEBUG_FS
+	sbinfo->si_dbgaufs = NULL;
+	sbinfo->si_dbgaufs_plink = NULL;
+	sbinfo->si_dbgaufs_xib = NULL;
+#ifdef CONFIG_AUFS_EXPORT
+	sbinfo->si_dbgaufs_xigen = NULL;
+#endif
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline void si_pid_idx_bit(int *idx, pid_t *bit)
+{
+	/* pid numbering starts at 1, but the bitmap index starts at 0 */
+	*bit = current->pid - 1;
+	*idx = *bit / AU_PIDSTEP;
+	*bit %= AU_PIDSTEP;
+}
+
+static inline int si_pid_test(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+
+	si_pid_idx_bit(&idx, &bit);
+	bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx];
+	if (bitmap)
+		return test_bit(bit, bitmap);
+	return 0;
+}
+
+static inline void si_pid_clr(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+
+	si_pid_idx_bit(&idx, &bit);
+	bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx];
+	BUG_ON(!bitmap);
+	AuDebugOn(!test_bit(bit, bitmap));
+	clear_bit(bit, bitmap);
+	/* smp_mb(); */
+}
+
+void si_pid_set(struct super_block *sb);
+
+/* ---------------------------------------------------------------------- */
+
+/* lock superblock. mainly for entry point functions */
+/*
+ * __si_read_lock, __si_write_lock,
+ * __si_read_unlock, __si_write_unlock, __si_downgrade_lock
+ */
+AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem);
+
+#define SiMustNoWaiters(sb)	AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem)
+#define SiMustAnyLock(sb)	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem)
+#define SiMustWriteLock(sb)	AuRwMustWriteLock(&au_sbi(sb)->si_rwsem)
+
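+/* the 'noflush' variants take si_rwsem without flushing pending 'nowait' tasks */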
+static inline void si_noflush_read_lock(struct super_block *sb)
+{
+	__si_read_lock(sb);
+	si_pid_set(sb);
+}
+
+static inline int si_noflush_read_trylock(struct super_block *sb)
+{
+	int locked;
+
+	locked = __si_read_trylock(sb);
+	if (locked)
+		si_pid_set(sb);
+	return locked;
+}
+
+static inline void si_noflush_write_lock(struct super_block *sb)
+{
+	__si_write_lock(sb);
+	si_pid_set(sb);
+}
+
+static inline int si_noflush_write_trylock(struct super_block *sb)
+{
+	int locked;
+
+	locked = __si_write_trylock(sb);
+	if (locked)
+		si_pid_set(sb);
+	return locked;
+}
+
+#if 0 /* reserved */
+static inline int si_read_trylock(struct super_block *sb, int flags)
+{
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+	return si_noflush_read_trylock(sb);
+}
+#endif
+
+static inline void si_read_unlock(struct super_block *sb)
+{
+	si_pid_clr(sb);
+	__si_read_unlock(sb);
+}
+
+#if 0 /* reserved */
+static inline int si_write_trylock(struct super_block *sb, int flags)
+{
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+	return si_noflush_write_trylock(sb);
+}
+#endif
+
+static inline void si_write_unlock(struct super_block *sb)
+{
+	si_pid_clr(sb);
+	__si_write_unlock(sb);
+}
+
+#if 0 /* reserved */
+static inline void si_downgrade_lock(struct super_block *sb)
+{
+	__si_downgrade_lock(sb);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline aufs_bindex_t au_sbend(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_bend;
+}
+
+static inline unsigned int au_mntflags(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_mntflags;
+}
+
+static inline unsigned int au_sigen(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_generation;
+}
+
+static inline void au_ninodes_inc(struct super_block *sb)
+{
+	atomic_long_inc(&au_sbi(sb)->si_ninodes);
+}
+
+static inline void au_ninodes_dec(struct super_block *sb)
+{
+	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes));
+	atomic_long_dec(&au_sbi(sb)->si_ninodes);
+}
+
+static inline void au_nfiles_inc(struct super_block *sb)
+{
+	atomic_long_inc(&au_sbi(sb)->si_nfiles);
+}
+
+static inline void au_nfiles_dec(struct super_block *sb)
+{
+	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles));
+	atomic_long_dec(&au_sbi(sb)->si_nfiles);
+}
+
+static inline struct au_branch *au_sbr(struct super_block *sb,
+				       aufs_bindex_t bindex)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_branch[0 + bindex];
+}
+
+static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid)
+{
+	SiMustWriteLock(sb);
+	au_sbi(sb)->si_xino_brid = brid;
+}
+
+static inline aufs_bindex_t au_xino_brid(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_xino_brid;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_SUPER_H__ */
diff --git b/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c
new file mode 100644
index 0000000..8ec10fb
--- /dev/null
+++ b/fs/aufs/sysaufs.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface and lifetime management
+ * they are necessary regardless of whether sysfs is disabled.
+ */
+
+#include <linux/random.h>
+#include "aufs.h"
+
+unsigned long sysaufs_si_mask;
+struct kset *sysaufs_kset;
+
+#define AuSiAttr(_name) { \
+	.attr   = { .name = __stringify(_name), .mode = 0444 },	\
+	.show   = sysaufs_si_##_name,				\
+}
+
+static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path);
+struct attribute *sysaufs_si_attrs[] = {
+	&sysaufs_si_attr_xi_path.attr,
+	NULL,
+};
+
+static const struct sysfs_ops au_sbi_ops = {
+	.show   = sysaufs_si_show
+};
+
+static struct kobj_type au_sbi_ktype = {
+	.release	= au_si_free,
+	.sysfs_ops	= &au_sbi_ops,
+	.default_attrs	= sysaufs_si_attrs
+};
+
+/* ---------------------------------------------------------------------- */
+
+int sysaufs_si_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+
+	sbinfo->si_kobj.kset = sysaufs_kset;
+	/* cf. sysaufs_name() */
+	err = kobject_init_and_add
+		(&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL,
+		 SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo));
+
+	dbgaufs_si_null(sbinfo);
+	if (!err) {
+		err = dbgaufs_si_init(sbinfo);
+		if (unlikely(err))
+			kobject_put(&sbinfo->si_kobj);
+	}
+	return err;
+}
+
+void sysaufs_fin(void)
+{
+	dbgaufs_fin();
+	sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group);
+	kset_unregister(sysaufs_kset);
+}
+
+int __init sysaufs_init(void)
+{
+	int err;
+
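+	/* pick a non-zero random mask; it obfuscates the sbinfo address in si_<id> */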
+	do {
+		get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask));
+	} while (!sysaufs_si_mask);
+
+	err = -EINVAL;
+	sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj);
+	if (unlikely(!sysaufs_kset))
+		goto out;
+	err = PTR_ERR(sysaufs_kset);
+	if (IS_ERR(sysaufs_kset))
+		goto out;
+	err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group);
+	if (unlikely(err)) {
+		kset_unregister(sysaufs_kset);
+		goto out;
+	}
+
+	err = dbgaufs_init();
+	if (unlikely(err))
+		sysaufs_fin();
+out:
+	return err;
+}
diff --git b/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h
new file mode 100644
index 0000000..1f79983
--- /dev/null
+++ b/fs/aufs/sysaufs.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface and mount lifetime management
+ */
+
+#ifndef __SYSAUFS_H__
+#define __SYSAUFS_H__
+
+#ifdef __KERNEL__
+
+#include <linux/sysfs.h>
+#include "module.h"
+
+struct super_block;
+struct au_sbinfo;
+
+struct sysaufs_si_attr {
+	struct attribute attr;
+	int (*show)(struct seq_file *seq, struct super_block *sb);
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* sysaufs.c */
+extern unsigned long sysaufs_si_mask;
+extern struct kset *sysaufs_kset;
+extern struct attribute *sysaufs_si_attrs[];
+int sysaufs_si_init(struct au_sbinfo *sbinfo);
+int __init sysaufs_init(void);
+void sysaufs_fin(void);
+
+/* ---------------------------------------------------------------------- */
+
+/* some people don't like to expose a kernel pointer */
+static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo)
+{
+	return sysaufs_si_mask ^ (unsigned long)sbinfo;
+}
+
+#define SysaufsSiNamePrefix	"si_"
+#define SysaufsSiNameLen	(sizeof(SysaufsSiNamePrefix) + 16)
+static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name)
+{
+	snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx",
+		 sysaufs_si_id(sbinfo));
+}
+
+struct au_branch;
+#ifdef CONFIG_SYSFS
+/* sysfs.c */
+extern struct attribute_group *sysaufs_attr_group;
+
+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb);
+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
+			 char *buf);
+long au_brinfo_ioctl(struct file *file, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_brinfo_compat_ioctl(struct file *file, unsigned long arg);
+#endif
+
+void sysaufs_br_init(struct au_branch *br);
+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
+
+#define sysaufs_brs_init()	do {} while (0)
+
+#else
+#define sysaufs_attr_group	NULL
+
+AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb)
+AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj,
+       struct attribute *attr, char *buf)
+AuStubVoid(sysaufs_br_init, struct au_branch *br)
+AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
+
+static inline void sysaufs_brs_init(void)
+{
+	sysaufs_brs = 0;
+}
+
+#endif /* CONFIG_SYSFS */
+
+#endif /* __KERNEL__ */
+#endif /* __SYSAUFS_H__ */
diff --git b/fs/aufs/sysfs.c b/fs/aufs/sysfs.c
new file mode 100644
index 0000000..ed42f53
--- /dev/null
+++ b/fs/aufs/sysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface
+ */
+
+#include <linux/compat.h>
+#include <linux/seq_file.h>
+#include "aufs.h"
+
+static struct attribute *au_attr[] = {
+	NULL,	/* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group sysaufs_attr_group_body = {
+	.attrs = au_attr
+};
+
+struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body;
+
+/* ---------------------------------------------------------------------- */
+
+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb)
+{
+	int err;
+
+	SiMustAnyLock(sb);
+
+	err = 0;
+	if (au_opt_test(au_mntflags(sb), XINO)) {
+		err = au_xino_path(seq, au_sbi(sb)->si_xib);
+		seq_putc(seq, '\n');
+	}
+	return err;
+}
+
+/*
+ * the lifetime of a branch is independent of its entry under sysfs.
+ * sysfs handles the lifetime of the entry, and never calls ->show() after it
+ * is unlinked.
+ */
+static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb,
+			 aufs_bindex_t bindex, int idx)
+{
+	int err;
+	struct path path;
+	struct dentry *root;
+	struct au_branch *br;
+	au_br_perm_str_t perm;
+
+	AuDbg("b%d\n", bindex);
+
+	err = 0;
+	root = sb->s_root;
+	di_read_lock_parent(root, !AuLock_IR);
+	br = au_sbr(sb, bindex);
+
+	switch (idx) {
+	case AuBrSysfs_BR:
+		path.mnt = au_br_mnt(br);
+		path.dentry = au_h_dptr(root, bindex);
+		err = au_seq_path(seq, &path);
+		if (!err) {
+			au_optstr_br_perm(&perm, br->br_perm);
+			seq_printf(seq, "=%s\n", perm.a);
+		}
+		break;
+	case AuBrSysfs_BRID:
+		seq_printf(seq, "%d\n", br->br_id);
+		break;
+	}
+	di_read_unlock(root, !AuLock_IR);
+	if (unlikely(err || seq_has_overflowed(seq)))
+		err = -E2BIG;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
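+/*
+ * build a minimal seq_file over the given buffer so that the seq_* helpers can
+ * be reused; the caller frees it with kfree().
+ */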
+static struct seq_file *au_seq(char *p, ssize_t len)
+{
+	struct seq_file *seq;
+
+	seq = kzalloc(sizeof(*seq), GFP_NOFS);
+	if (seq) {
+		/* mutex_init(&seq.lock); */
+		seq->buf = p;
+		seq->size = len;
+		return seq; /* success */
+	}
+
+	seq = ERR_PTR(-ENOMEM);
+	return seq;
+}
+
+#define SysaufsBr_PREFIX	"br"
+#define SysaufsBrid_PREFIX	"brid"
+
+/* todo: file size may exceed PAGE_SIZE */
+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
+			char *buf)
+{
+	ssize_t err;
+	int idx;
+	long l;
+	aufs_bindex_t bend;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct seq_file *seq;
+	char *name;
+	struct attribute **cattr;
+
+	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
+	sb = sbinfo->si_sb;
+
+	/*
+	 * prevent a race condition between sysfs and aufs.
+	 * for instance, sysfs_file_read() calls sysfs_get_active_two(), which
+	 * prohibits maintaining the sysfs entries.
+	 * here we acquire the read lock after sysfs_get_active_two().
+	 * on the other hand, the remount process may maintain the sysfs/aufs
+	 * entries after acquiring the write lock, which can cause a deadlock.
+	 * so we simply give up processing the read here.
+	 */
+	err = -EBUSY;
+	if (unlikely(!si_noflush_read_trylock(sb)))
+		goto out;
+
+	seq = au_seq(buf, PAGE_SIZE);
+	err = PTR_ERR(seq);
+	if (IS_ERR(seq))
+		goto out_unlock;
+
+	name = (void *)attr->name;
+	cattr = sysaufs_si_attrs;
+	while (*cattr) {
+		if (!strcmp(name, (*cattr)->name)) {
+			err = container_of(*cattr, struct sysaufs_si_attr, attr)
+				->show(seq, sb);
+			goto out_seq;
+		}
+		cattr++;
+	}
+
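+	/* not a static attribute; it must be one of brN/bridN, so parse the index */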
+	if (!strncmp(name, SysaufsBrid_PREFIX,
+		     sizeof(SysaufsBrid_PREFIX) - 1)) {
+		idx = AuBrSysfs_BRID;
+		name += sizeof(SysaufsBrid_PREFIX) - 1;
+	} else if (!strncmp(name, SysaufsBr_PREFIX,
+			    sizeof(SysaufsBr_PREFIX) - 1)) {
+		idx = AuBrSysfs_BR;
+		name += sizeof(SysaufsBr_PREFIX) - 1;
+	} else
+		BUG();
+
+	err = kstrtol(name, 10, &l);
+	if (!err) {
+		bend = au_sbend(sb);
+		if (l <= bend)
+			err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx);
+		else
+			err = -ENOENT;
+	}
+
+out_seq:
+	if (!err) {
+		err = seq->count;
+		/* sysfs limit */
+		if (unlikely(err == PAGE_SIZE))
+			err = -EFBIG;
+	}
+	kfree(seq);
+out_unlock:
+	si_read_unlock(sb);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg)
+{
+	int err;
+	int16_t brid;
+	aufs_bindex_t bindex, bend;
+	size_t sz;
+	char *buf;
+	struct seq_file *seq;
+	struct au_branch *br;
+
+	si_read_lock(sb, AuLock_FLUSH);
+	bend = au_sbend(sb);
+	err = bend + 1;
+	if (!arg)
+		goto out;
+
+	err = -ENOMEM;
+	buf = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!buf))
+		goto out;
+
+	seq = au_seq(buf, PAGE_SIZE);
+	err = PTR_ERR(seq);
+	if (IS_ERR(seq))
+		goto out_buf;
+
+	sz = sizeof(*arg) - offsetof(union aufs_brinfo, path);
+	for (bindex = 0; bindex <= bend; bindex++, arg++) {
+		err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg));
+		if (unlikely(err))
+			break;
+
+		br = au_sbr(sb, bindex);
+		brid = br->br_id;
+		BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id));
+		err = __put_user(brid, &arg->id);
+		if (unlikely(err))
+			break;
+
+		BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm));
+		err = __put_user(br->br_perm, &arg->perm);
+		if (unlikely(err))
+			break;
+
+		err = au_seq_path(seq, &br->br_path);
+		if (unlikely(err))
+			break;
+		seq_putc(seq, '\0');
+		if (!seq_has_overflowed(seq)) {
+			err = copy_to_user(arg->path, seq->buf, seq->count);
+			seq->count = 0;
+			if (unlikely(err))
+				break;
+		} else {
+			err = -E2BIG;
+			goto out_seq;
+		}
+	}
+	if (unlikely(err))
+		err = -EFAULT;
+
+out_seq:
+	kfree(seq);
+out_buf:
+	free_page((unsigned long)buf);
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+long au_brinfo_ioctl(struct file *file, unsigned long arg)
+{
+	return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg);
+}
+
+#ifdef CONFIG_COMPAT
+long au_brinfo_compat_ioctl(struct file *file, unsigned long arg)
+{
+	return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg));
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+void sysaufs_br_init(struct au_branch *br)
+{
+	int i;
+	struct au_brsysfs *br_sysfs;
+	struct attribute *attr;
+
+	br_sysfs = br->br_sysfs;
+	for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+		attr = &br_sysfs->attr;
+		sysfs_attr_init(attr);
+		attr->name = br_sysfs->name;
+		attr->mode = S_IRUGO;
+		br_sysfs++;
+	}
+}
+
+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_branch *br;
+	struct kobject *kobj;
+	struct au_brsysfs *br_sysfs;
+	int i;
+	aufs_bindex_t bend;
+
+	dbgaufs_brs_del(sb, bindex);
+
+	if (!sysaufs_brs)
+		return;
+
+	kobj = &au_sbi(sb)->si_kobj;
+	bend = au_sbend(sb);
+	for (; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		br_sysfs = br->br_sysfs;
+		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+			sysfs_remove_file(kobj, &br_sysfs->attr);
+			br_sysfs++;
+		}
+	}
+}
+
+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err, i;
+	aufs_bindex_t bend;
+	struct kobject *kobj;
+	struct au_branch *br;
+	struct au_brsysfs *br_sysfs;
+
+	dbgaufs_brs_add(sb, bindex);
+
+	if (!sysaufs_brs)
+		return;
+
+	kobj = &au_sbi(sb)->si_kobj;
+	bend = au_sbend(sb);
+	for (; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		br_sysfs = br->br_sysfs;
+		snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name),
+			 SysaufsBr_PREFIX "%d", bindex);
+		snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name),
+			 SysaufsBrid_PREFIX "%d", bindex);
+		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+			err = sysfs_create_file(kobj, &br_sysfs->attr);
+			if (unlikely(err))
+				pr_warn("failed %s under sysfs(%d)\n",
+					br_sysfs->name, err);
+			br_sysfs++;
+		}
+	}
+}
diff --git b/fs/aufs/sysrq.c b/fs/aufs/sysrq.c
new file mode 100644
index 0000000..7d22979
--- /dev/null
+++ b/fs/aufs/sysrq.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * magic sysrq handler
+ */
+
+/* #include <linux/sysrq.h> */
+#include <linux/writeback.h>
+#include "aufs.h"
+
+/* ---------------------------------------------------------------------- */
+
+static void sysrq_sb(struct super_block *sb)
+{
+	char *plevel;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+	struct au_sphlhead *files;
+	struct au_finfo *finfo;
+
+	plevel = au_plevel;
+	au_plevel = KERN_WARNING;
+
+	/* since we define pr_fmt, call printk directly */
+#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str)
+
+	sbinfo = au_sbi(sb);
+	printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo));
+	pr("superblock\n");
+	au_dpri_sb(sb);
+
+#if 0
+	pr("root dentry\n");
+	au_dpri_dentry(sb->s_root);
+	pr("root inode\n");
+	au_dpri_inode(d_inode(sb->s_root));
+#endif
+
+#if 0
+	do {
+		int err, i, j, ndentry;
+		struct au_dcsub_pages dpages;
+		struct au_dpage *dpage;
+
+		err = au_dpages_init(&dpages, GFP_ATOMIC);
+		if (unlikely(err))
+			break;
+		err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL);
+		if (!err)
+			for (i = 0; i < dpages.ndpage; i++) {
+				dpage = dpages.dpages + i;
+				ndentry = dpage->ndentry;
+				for (j = 0; j < ndentry; j++)
+					au_dpri_dentry(dpage->dentries[j]);
+			}
+		au_dpages_free(&dpages);
+	} while (0);
+#endif
+
+#if 1
+	{
+		struct inode *i;
+
+		pr("isolated inode\n");
+		spin_lock(&sb->s_inode_list_lock);
+		list_for_each_entry(i, &sb->s_inodes, i_sb_list) {
+			spin_lock(&i->i_lock);
+			if (1 || hlist_empty(&i->i_dentry))
+				au_dpri_inode(i);
+			spin_unlock(&i->i_lock);
+		}
+		spin_unlock(&sb->s_inode_list_lock);
+	}
+#endif
+	pr("files\n");
+	files = &au_sbi(sb)->si_files;
+	spin_lock(&files->spin);
+	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
+		umode_t mode;
+
+		file = finfo->fi_file;
+		mode = file_inode(file)->i_mode;
+		if (!special_file(mode))
+			au_dpri_file(file);
+	}
+	spin_unlock(&files->spin);
+	pr("done\n");
+
+#undef pr
+	au_plevel = plevel;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* module parameter */
+static char *aufs_sysrq_key = "a";
+module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO);
+MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME);
+
+static void au_sysrq(int key __maybe_unused)
+{
+	struct au_sbinfo *sbinfo;
+
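+	/* walk all mounted aufs superblocks and dump their state */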
+	lockdep_off();
+	au_sbilist_lock();
+	hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list)
+		sysrq_sb(sbinfo->si_sb);
+	au_sbilist_unlock();
+	lockdep_on();
+}
+
+static struct sysrq_key_op au_sysrq_op = {
+	.handler	= au_sysrq,
+	.help_msg	= "Aufs",
+	.action_msg	= "Aufs",
+	.enable_mask	= SYSRQ_ENABLE_DUMP
+};
+
+/* ---------------------------------------------------------------------- */
+
+int __init au_sysrq_init(void)
+{
+	int err;
+	char key;
+
+	err = -1;
+	key = *aufs_sysrq_key;
+	if ('a' <= key && key <= 'z')
+		err = register_sysrq_key(key, &au_sysrq_op);
+	if (unlikely(err))
+		pr_err("err %d, sysrq=%c\n", err, key);
+	return err;
+}
+
+void au_sysrq_fin(void)
+{
+	int err;
+
+	err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op);
+	if (unlikely(err))
+		pr_err("err %d (ignored)\n", err);
+}
diff --git b/fs/aufs/vdir.c b/fs/aufs/vdir.c
new file mode 100644
index 0000000..f64cc2b
--- /dev/null
+++ b/fs/aufs/vdir.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * virtual or vertical directory
+ */
+
+#include "aufs.h"
+
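+/* size of one virtual dir entry: header plus name, rounded up for alignment */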
+static unsigned int calc_size(int nlen)
+{
+	return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t));
+}
+
+static int set_deblk_end(union au_vdir_deblk_p *p,
+			 union au_vdir_deblk_p *deblk_end)
+{
+	if (calc_size(0) <= deblk_end->deblk - p->deblk) {
+		p->de->de_str.len = 0;
+		/* smp_mb(); */
+		return 0;
+	}
+	return -1; /* error */
+}
+
+/* returns true or false */
+static int is_deblk_end(union au_vdir_deblk_p *p,
+			union au_vdir_deblk_p *deblk_end)
+{
+	if (calc_size(0) <= deblk_end->deblk - p->deblk)
+		return !p->de->de_str.len;
+	return 1;
+}
+
+static unsigned char *last_deblk(struct au_vdir *vdir)
+{
+	return vdir->vd_deblk[vdir->vd_nblk - 1];
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* estimate the appropriate size for name hash table */
+unsigned int au_rdhash_est(loff_t sz)
+{
+	unsigned int n;
+
+	n = UINT_MAX;
+	sz >>= 10;
+	if (sz < n)
+		n = sz;
+	if (sz < AUFS_RDHASH_DEF)
+		n = AUFS_RDHASH_DEF;
+	/* pr_info("n %u\n", n); */
+	return n;
+}
+
+/*
+ * the allocated memory has to be freed by
+ * au_nhash_wh_free() or au_nhash_de_free().
+ */
+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp)
+{
+	struct hlist_head *head;
+	unsigned int u;
+	size_t sz;
+
+	sz = sizeof(*nhash->nh_head) * num_hash;
+	head = kmalloc(sz, gfp);
+	if (head) {
+		nhash->nh_num = num_hash;
+		nhash->nh_head = head;
+		for (u = 0; u < num_hash; u++)
+			INIT_HLIST_HEAD(head++);
+		return 0; /* success */
+	}
+
+	return -ENOMEM;
+}
+
+static void nhash_count(struct hlist_head *head)
+{
+#if 0
+	unsigned long n;
+	struct hlist_node *pos;
+
+	n = 0;
+	hlist_for_each(pos, head)
+		n++;
+	pr_info("%lu\n", n);
+#endif
+}
+
+static void au_nhash_wh_do_free(struct hlist_head *head)
+{
+	struct au_vdir_wh *pos;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_safe(pos, node, head, wh_hash)
+		kfree(pos);
+}
+
+static void au_nhash_de_do_free(struct hlist_head *head)
+{
+	struct au_vdir_dehstr *pos;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_safe(pos, node, head, hash)
+		au_cache_free_vdir_dehstr(pos);
+}
+
+static void au_nhash_do_free(struct au_nhash *nhash,
+			     void (*free)(struct hlist_head *head))
+{
+	unsigned int n;
+	struct hlist_head *head;
+
+	n = nhash->nh_num;
+	if (!n)
+		return;
+
+	head = nhash->nh_head;
+	while (n-- > 0) {
+		nhash_count(head);
+		free(head++);
+	}
+	kfree(nhash->nh_head);
+}
+
+void au_nhash_wh_free(struct au_nhash *whlist)
+{
+	au_nhash_do_free(whlist, au_nhash_wh_do_free);
+}
+
+static void au_nhash_de_free(struct au_nhash *delist)
+{
+	au_nhash_do_free(delist, au_nhash_de_do_free);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
+			    int limit)
+{
+	int num;
+	unsigned int u, n;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+
+	num = 0;
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (u = 0; u < n; u++, head++)
+		hlist_for_each_entry(pos, head, wh_hash)
+			if (pos->wh_bindex == btgt && ++num > limit)
+				return 1;
+	return 0;
+}
+
+static struct hlist_head *au_name_hash(struct au_nhash *nhash,
+				       unsigned char *name,
+				       unsigned int len)
+{
+	unsigned int v;
+	/* const unsigned int magic_bit = 12; */
+
+	AuDebugOn(!nhash->nh_num || !nhash->nh_head);
+
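+	/* simple additive hash: sum of the name bytes, modulo the table size */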
+	v = 0;
+	while (len--)
+		v += *name++;
+	/* v = hash_long(v, magic_bit); */
+	v %= nhash->nh_num;
+	return nhash->nh_head + v;
+}
+
+static int au_nhash_test_name(struct au_vdir_destr *str, const char *name,
+			      int nlen)
+{
+	return str->len == nlen && !memcmp(str->name, name, nlen);
+}
+
+/* returns found or not */
+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen)
+{
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct au_vdir_destr *str;
+
+	head = au_name_hash(whlist, name, nlen);
+	hlist_for_each_entry(pos, head, wh_hash) {
+		str = &pos->wh_str;
+		AuDbg("%.*s\n", str->len, str->name);
+		if (au_nhash_test_name(str, name, nlen))
+			return 1;
+	}
+	return 0;
+}
+
+/* returns found(true) or not */
+static int test_known(struct au_nhash *delist, char *name, int nlen)
+{
+	struct hlist_head *head;
+	struct au_vdir_dehstr *pos;
+	struct au_vdir_destr *str;
+
+	head = au_name_hash(delist, name, nlen);
+	hlist_for_each_entry(pos, head, hash) {
+		str = pos->str;
+		AuDbg("%.*s\n", str->len, str->name);
+		if (au_nhash_test_name(str, name, nlen))
+			return 1;
+	}
+	return 0;
+}
+
+static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino,
+			    unsigned char d_type)
+{
+#ifdef CONFIG_AUFS_SHWH
+	wh->wh_ino = ino;
+	wh->wh_type = d_type;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
+		       unsigned int d_type, aufs_bindex_t bindex,
+		       unsigned char shwh)
+{
+	int err;
+	struct au_vdir_destr *str;
+	struct au_vdir_wh *wh;
+
+	AuDbg("%.*s\n", nlen, name);
+	AuDebugOn(!whlist->nh_num || !whlist->nh_head);
+
+	err = -ENOMEM;
+	wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS);
+	if (unlikely(!wh))
+		goto out;
+
+	err = 0;
+	wh->wh_bindex = bindex;
+	if (shwh)
+		au_shwh_init_wh(wh, ino, d_type);
+	str = &wh->wh_str;
+	str->len = nlen;
+	memcpy(str->name, name, nlen);
+	hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen));
+	/* smp_mb(); */
+
+out:
+	return err;
+}
+
+static int append_deblk(struct au_vdir *vdir)
+{
+	int err;
+	unsigned long ul;
+	const unsigned int deblk_sz = vdir->vd_deblk_sz;
+	union au_vdir_deblk_p p, deblk_end;
+	unsigned char **o;
+
+	err = -ENOMEM;
+	o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1),
+		     GFP_NOFS);
+	if (unlikely(!o))
+		goto out;
+
+	vdir->vd_deblk = o;
+	p.deblk = kmalloc(deblk_sz, GFP_NOFS);
+	if (p.deblk) {
+		ul = vdir->vd_nblk++;
+		vdir->vd_deblk[ul] = p.deblk;
+		vdir->vd_last.ul = ul;
+		vdir->vd_last.p.deblk = p.deblk;
+		deblk_end.deblk = p.deblk + deblk_sz;
+		err = set_deblk_end(&p, &deblk_end);
+	}
+
+out:
+	return err;
+}
+
+static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino,
+		     unsigned int d_type, struct au_nhash *delist)
+{
+	int err;
+	unsigned int sz;
+	const unsigned int deblk_sz = vdir->vd_deblk_sz;
+	union au_vdir_deblk_p p, *room, deblk_end;
+	struct au_vdir_dehstr *dehstr;
+
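+	/* append into the current (last) deblk; grow if the entry does not fit */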
+	p.deblk = last_deblk(vdir);
+	deblk_end.deblk = p.deblk + deblk_sz;
+	room = &vdir->vd_last.p;
+	AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk
+		  || !is_deblk_end(room, &deblk_end));
+
+	sz = calc_size(nlen);
+	if (unlikely(sz > deblk_end.deblk - room->deblk)) {
+		err = append_deblk(vdir);
+		if (unlikely(err))
+			goto out;
+
+		p.deblk = last_deblk(vdir);
+		deblk_end.deblk = p.deblk + deblk_sz;
+		/* smp_mb(); */
+		AuDebugOn(room->deblk != p.deblk);
+	}
+
+	err = -ENOMEM;
+	dehstr = au_cache_alloc_vdir_dehstr();
+	if (unlikely(!dehstr))
+		goto out;
+
+	dehstr->str = &room->de->de_str;
+	hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen));
+	room->de->de_ino = ino;
+	room->de->de_type = d_type;
+	room->de->de_str.len = nlen;
+	memcpy(room->de->de_str.name, name, nlen);
+
+	err = 0;
+	room->deblk += sz;
+	if (unlikely(set_deblk_end(room, &deblk_end)))
+		err = append_deblk(vdir);
+	/* smp_mb(); */
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_vdir_free(struct au_vdir *vdir)
+{
+	unsigned char **deblk;
+
+	deblk = vdir->vd_deblk;
+	while (vdir->vd_nblk--)
+		kfree(*deblk++);
+	kfree(vdir->vd_deblk);
+	au_cache_free_vdir(vdir);
+}
+
+static struct au_vdir *alloc_vdir(struct file *file)
+{
+	struct au_vdir *vdir;
+	struct super_block *sb;
+	int err;
+
+	sb = file->f_path.dentry->d_sb;
+	SiMustAnyLock(sb);
+
+	err = -ENOMEM;
+	vdir = au_cache_alloc_vdir();
+	if (unlikely(!vdir))
+		goto out;
+
+	vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS);
+	if (unlikely(!vdir->vd_deblk))
+		goto out_free;
+
+	vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk;
+	if (!vdir->vd_deblk_sz) {
+		/* estimate the appropriate size for deblk */
+		vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL);
+		/* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */
+	}
+	vdir->vd_nblk = 0;
+	vdir->vd_version = 0;
+	vdir->vd_jiffy = 0;
+	err = append_deblk(vdir);
+	if (!err)
+		return vdir; /* success */
+
+	kfree(vdir->vd_deblk);
+
+out_free:
+	au_cache_free_vdir(vdir);
+out:
+	vdir = ERR_PTR(err);
+	return vdir;
+}
+
+static int reinit_vdir(struct au_vdir *vdir)
+{
+	int err;
+	union au_vdir_deblk_p p, deblk_end;
+
+	while (vdir->vd_nblk > 1) {
+		kfree(vdir->vd_deblk[vdir->vd_nblk - 1]);
+		/* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */
+		vdir->vd_nblk--;
+	}
+	p.deblk = vdir->vd_deblk[0];
+	deblk_end.deblk = p.deblk + vdir->vd_deblk_sz;
+	err = set_deblk_end(&p, &deblk_end);
+	/* keep vd_dblk_sz */
+	vdir->vd_last.ul = 0;
+	vdir->vd_last.p.deblk = vdir->vd_deblk[0];
+	vdir->vd_version = 0;
+	vdir->vd_jiffy = 0;
+	/* smp_mb(); */
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define AuFillVdir_CALLED	1
+#define AuFillVdir_WHABLE	(1 << 1)
+#define AuFillVdir_SHWH		(1 << 2)
+#define au_ftest_fillvdir(flags, name)	((flags) & AuFillVdir_##name)
+#define au_fset_fillvdir(flags, name) \
+	do { (flags) |= AuFillVdir_##name; } while (0)
+#define au_fclr_fillvdir(flags, name) \
+	do { (flags) &= ~AuFillVdir_##name; } while (0)
+
+#ifndef CONFIG_AUFS_SHWH
+#undef AuFillVdir_SHWH
+#define AuFillVdir_SHWH		0
+#endif
+
+struct fillvdir_arg {
+	struct dir_context	ctx;
+	struct file		*file;
+	struct au_vdir		*vdir;
+	struct au_nhash		delist;
+	struct au_nhash		whlist;
+	aufs_bindex_t		bindex;
+	unsigned int		flags;
+	int			err;
+};
+
+static int fillvdir(struct dir_context *ctx, const char *__name, int nlen,
+		    loff_t offset __maybe_unused, u64 h_ino,
+		    unsigned int d_type)
+{
+	struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx);
+	char *name = (void *)__name;
+	struct super_block *sb;
+	ino_t ino;
+	const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH);
+
+	arg->err = 0;
+	sb = arg->file->f_path.dentry->d_sb;
+	au_fset_fillvdir(arg->flags, CALLED);
+	/* smp_mb(); */
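+	/* unprefixed names are real entries; whiteout-prefixed ones record whiteouts */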
+	if (nlen <= AUFS_WH_PFX_LEN
+	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+		if (test_known(&arg->delist, name, nlen)
+		    || au_nhash_test_known_wh(&arg->whlist, name, nlen))
+			goto out; /* already exists or whiteouted */
+
+		arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino);
+		if (!arg->err) {
+			if (unlikely(nlen > AUFS_MAX_NAMELEN))
+				d_type = DT_UNKNOWN;
+			arg->err = append_de(arg->vdir, name, nlen, ino,
+					     d_type, &arg->delist);
+		}
+	} else if (au_ftest_fillvdir(arg->flags, WHABLE)) {
+		name += AUFS_WH_PFX_LEN;
+		nlen -= AUFS_WH_PFX_LEN;
+		if (au_nhash_test_known_wh(&arg->whlist, name, nlen))
+			goto out; /* already whiteouted */
+
+		if (shwh)
+			arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type,
+					     &ino);
+		if (!arg->err) {
+			if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN)
+				d_type = DT_UNKNOWN;
+			arg->err = au_nhash_append_wh
+				(&arg->whlist, name, nlen, ino, d_type,
+				 arg->bindex, shwh);
+		}
+	}
+
+out:
+	if (!arg->err)
+		arg->vdir->vd_jiffy = jiffies;
+	/* smp_mb(); */
+	AuTraceErr(arg->err);
+	return arg->err;
+}
+
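+/*
+ * with the 'shwh' mount option, re-insert the collected whiteouts into the
+ * entry list as visible names with the whiteout prefix restored.
+ */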
+static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir,
+			  struct au_nhash *whlist, struct au_nhash *delist)
+{
+#ifdef CONFIG_AUFS_SHWH
+	int err;
+	unsigned int nh, u;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct hlist_node *n;
+	char *p, *o;
+	struct au_vdir_destr *destr;
+
+	AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH));
+
+	err = -ENOMEM;
+	o = p = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = 0;
+	nh = whlist->nh_num;
+	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+	p += AUFS_WH_PFX_LEN;
+	for (u = 0; u < nh; u++) {
+		head = whlist->nh_head + u;
+		hlist_for_each_entry_safe(pos, n, head, wh_hash) {
+			destr = &pos->wh_str;
+			memcpy(p, destr->name, destr->len);
+			err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN,
+					pos->wh_ino, pos->wh_type, delist);
+			if (unlikely(err))
+				break;
+		}
+	}
+
+	free_page((unsigned long)o);
+
+out:
+	AuTraceErr(err);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+static int au_do_read_vdir(struct fillvdir_arg *arg)
+{
+	int err;
+	unsigned int rdhash;
+	loff_t offset;
+	aufs_bindex_t bend, bindex, bstart;
+	unsigned char shwh;
+	struct file *hf, *file;
+	struct super_block *sb;
+
+	file = arg->file;
+	sb = file->f_path.dentry->d_sb;
+	SiMustAnyLock(sb);
+
+	rdhash = au_sbi(sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL));
+	err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out_delist;
+
+	err = 0;
+	arg->flags = 0;
+	shwh = 0;
+	if (au_opt_test(au_mntflags(sb), SHWH)) {
+		shwh = 1;
+		au_fset_fillvdir(arg->flags, SHWH);
+	}
+	bstart = au_fbstart(file);
+	bend = au_fbend_dir(file);
+	for (bindex = bstart; !err && bindex <= bend; bindex++) {
+		hf = au_hf_dir(file, bindex);
+		if (!hf)
+			continue;
+
+		offset = vfsub_llseek(hf, 0, SEEK_SET);
+		err = offset;
+		if (unlikely(offset))
+			break;
+
+		arg->bindex = bindex;
+		au_fclr_fillvdir(arg->flags, WHABLE);
+		if (shwh
+		    || (bindex != bend
+			&& au_br_whable(au_sbr_perm(sb, bindex))))
+			au_fset_fillvdir(arg->flags, WHABLE);
+		do {
+			arg->err = 0;
+			au_fclr_fillvdir(arg->flags, CALLED);
+			/* smp_mb(); */
+			err = vfsub_iterate_dir(hf, &arg->ctx);
+			if (err >= 0)
+				err = arg->err;
+		} while (!err && au_ftest_fillvdir(arg->flags, CALLED));
+
+		/*
+		 * dir_relax() may be good for concurrency, but aufs should not
+		 * use it since it will cause a lockdep problem.
+		 */
+	}
+
+	if (!err && shwh)
+		err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist);
+
+	au_nhash_wh_free(&arg->whlist);
+
+out_delist:
+	au_nhash_de_free(&arg->delist);
+out:
+	return err;
+}
+
+static int read_vdir(struct file *file, int may_read)
+{
+	int err;
+	unsigned long expire;
+	unsigned char do_read;
+	struct fillvdir_arg arg = {
+		.ctx = {
+			.actor = fillvdir
+		}
+	};
+	struct inode *inode;
+	struct au_vdir *vdir, *allocated;
+
+	err = 0;
+	inode = file_inode(file);
+	IMustLock(inode);
+	SiMustAnyLock(inode->i_sb);
+
+	allocated = NULL;
+	do_read = 0;
+	expire = au_sbi(inode->i_sb)->si_rdcache;
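+	/* reuse the cached vdir unless i_version changed or the rdcache period expired */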
+	vdir = au_ivdir(inode);
+	if (!vdir) {
+		do_read = 1;
+		vdir = alloc_vdir(file);
+		err = PTR_ERR(vdir);
+		if (IS_ERR(vdir))
+			goto out;
+		err = 0;
+		allocated = vdir;
+	} else if (may_read
+		   && (inode->i_version != vdir->vd_version
+		       || time_after(jiffies, vdir->vd_jiffy + expire))) {
+		do_read = 1;
+		err = reinit_vdir(vdir);
+		if (unlikely(err))
+			goto out;
+	}
+
+	if (!do_read)
+		return 0; /* success */
+
+	arg.file = file;
+	arg.vdir = vdir;
+	err = au_do_read_vdir(&arg);
+	if (!err) {
+		/* file->f_pos = 0; */ /* todo: ctx->pos? */
+		vdir->vd_version = inode->i_version;
+		vdir->vd_last.ul = 0;
+		vdir->vd_last.p.deblk = vdir->vd_deblk[0];
+		if (allocated)
+			au_set_ivdir(inode, allocated);
+	} else if (allocated)
+		au_vdir_free(allocated);
+
+out:
+	return err;
+}
+
+static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src)
+{
+	int err, rerr;
+	unsigned long ul, n;
+	const unsigned int deblk_sz = src->vd_deblk_sz;
+
+	AuDebugOn(tgt->vd_nblk != 1);
+
+	err = -ENOMEM;
+	if (tgt->vd_nblk < src->vd_nblk) {
+		unsigned char **p;
+
+		p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk,
+			     GFP_NOFS);
+		if (unlikely(!p))
+			goto out;
+		tgt->vd_deblk = p;
+	}
+
+	if (tgt->vd_deblk_sz != deblk_sz) {
+		unsigned char *p;
+
+		tgt->vd_deblk_sz = deblk_sz;
+		p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS);
+		if (unlikely(!p))
+			goto out;
+		tgt->vd_deblk[0] = p;
+	}
+	memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz);
+	tgt->vd_version = src->vd_version;
+	tgt->vd_jiffy = src->vd_jiffy;
+
+	n = src->vd_nblk;
+	for (ul = 1; ul < n; ul++) {
+		tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz,
+					    GFP_NOFS);
+		if (unlikely(!tgt->vd_deblk[ul]))
+			goto out;
+		tgt->vd_nblk++;
+	}
+	tgt->vd_nblk = n;
+	tgt->vd_last.ul = src->vd_last.ul;	/* mirror src's cursor */
+	tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul];
+	tgt->vd_last.p.deblk += src->vd_last.p.deblk
+		- src->vd_deblk[src->vd_last.ul];
+	/* smp_mb(); */
+	return 0; /* success */
+
+out:
+	rerr = reinit_vdir(tgt);
+	BUG_ON(rerr);
+	return err;
+}
+
+int au_vdir_init(struct file *file)
+{
+	int err;
+	struct inode *inode;
+	struct au_vdir *vdir_cache, *allocated;
+
+	/* test file->f_pos here instead of ctx->pos */
+	err = read_vdir(file, !file->f_pos);
+	if (unlikely(err))
+		goto out;
+
+	allocated = NULL;
+	vdir_cache = au_fvdir_cache(file);
+	if (!vdir_cache) {
+		vdir_cache = alloc_vdir(file);
+		err = PTR_ERR(vdir_cache);
+		if (IS_ERR(vdir_cache))
+			goto out;
+		allocated = vdir_cache;
+	} else if (!file->f_pos && vdir_cache->vd_version != file->f_version) {
+		/* test file->f_pos here instead of ctx->pos */
+		err = reinit_vdir(vdir_cache);
+		if (unlikely(err))
+			goto out;
+	} else
+		return 0; /* success */
+
+	inode = file_inode(file);
+	err = copy_vdir(vdir_cache, au_ivdir(inode));
+	if (!err) {
+		file->f_version = inode->i_version;
+		if (allocated)
+			au_set_fvdir_cache(file, allocated);
+	} else if (allocated)
+		au_vdir_free(allocated);
+
+out:
+	return err;
+}
+
+static loff_t calc_offset(struct au_vdir *vdir)
+{
+	loff_t offset;
+	union au_vdir_deblk_p p;
+
+	p.deblk = vdir->vd_deblk[vdir->vd_last.ul];
+	offset = vdir->vd_last.p.deblk - p.deblk;
+	offset += vdir->vd_deblk_sz * vdir->vd_last.ul;
+	return offset;
+}
+
+/* returns true or false */
+static int seek_vdir(struct file *file, struct dir_context *ctx)
+{
+	int valid;
+	unsigned int deblk_sz;
+	unsigned long ul, n;
+	loff_t offset;
+	union au_vdir_deblk_p p, deblk_end;
+	struct au_vdir *vdir_cache;
+
+	valid = 1;
+	vdir_cache = au_fvdir_cache(file);
+	offset = calc_offset(vdir_cache);
+	AuDbg("offset %lld\n", offset);
+	if (ctx->pos == offset)
+		goto out;
+
+	vdir_cache->vd_last.ul = 0;
+	vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0];
+	if (!ctx->pos)
+		goto out;
+
+	valid = 0;
+	deblk_sz = vdir_cache->vd_deblk_sz;
+	ul = div64_u64(ctx->pos, deblk_sz);
+	AuDbg("ul %lu\n", ul);
+	if (ul >= vdir_cache->vd_nblk)
+		goto out;
+
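+	/* from the computed deblk, step over entries until the offset reaches ctx->pos */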
+	n = vdir_cache->vd_nblk;
+	for (; ul < n; ul++) {
+		p.deblk = vdir_cache->vd_deblk[ul];
+		deblk_end.deblk = p.deblk + deblk_sz;
+		offset = ul;
+		offset *= deblk_sz;
+		while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) {
+			unsigned int l;
+
+			l = calc_size(p.de->de_str.len);
+			offset += l;
+			p.deblk += l;
+		}
+		if (!is_deblk_end(&p, &deblk_end)) {
+			valid = 1;
+			vdir_cache->vd_last.ul = ul;
+			vdir_cache->vd_last.p = p;
+			break;
+		}
+	}
+
+out:
+	/* smp_mb(); */
+	AuTraceErr(!valid);
+	return valid;
+}
+
+int au_vdir_fill_de(struct file *file, struct dir_context *ctx)
+{
+	unsigned int l, deblk_sz;
+	union au_vdir_deblk_p deblk_end;
+	struct au_vdir *vdir_cache;
+	struct au_vdir_de *de;
+
+	vdir_cache = au_fvdir_cache(file);
+	if (!seek_vdir(file, ctx))
+		return 0;
+
+	deblk_sz = vdir_cache->vd_deblk_sz;
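+	/* emit entries from the cached cursor, advancing ctx->pos by each entry size */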
+	while (1) {
+		deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
+		deblk_end.deblk += deblk_sz;
+		while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) {
+			de = vdir_cache->vd_last.p.de;
+			AuDbg("%.*s, off%lld, i%lu, dt%d\n",
+			      de->de_str.len, de->de_str.name, ctx->pos,
+			      (unsigned long)de->de_ino, de->de_type);
+			if (unlikely(!dir_emit(ctx, de->de_str.name,
+					       de->de_str.len, de->de_ino,
+					       de->de_type))) {
+				/* todo: ignore the error caused by udba? */
+				/* return err; */
+				return 0;
+			}
+
+			l = calc_size(de->de_str.len);
+			vdir_cache->vd_last.p.deblk += l;
+			ctx->pos += l;
+		}
+		if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) {
+			vdir_cache->vd_last.ul++;
+			vdir_cache->vd_last.p.deblk
+				= vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
+			ctx->pos = deblk_sz * vdir_cache->vd_last.ul;
+			continue;
+		}
+		break;
+	}
+
+	/* smp_mb(); */
+	return 0;
+}
diff --git b/fs/aufs/vfsub.c b/fs/aufs/vfsub.c
new file mode 100644
index 0000000..29e0bbc
--- /dev/null
+++ b/fs/aufs/vfsub.c
@@ -0,0 +1,871 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for VFS
+ */
+
+#include <linux/namei.h>
+#include <linux/nsproxy.h>
+#include <linux/security.h>
+#include <linux/splice.h>
+#include "../fs/mount.h"
+#include "aufs.h"
+
+#ifdef CONFIG_AUFS_BR_FUSE
+int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb)
+{
+	struct nsproxy *ns;
+
+	if (!au_test_fuse(h_sb) || !au_userns)
+		return 0;
+
+	ns = current->nsproxy;
+	/* no {get,put}_nsproxy(ns) */
+	return real_mount(mnt)->mnt_ns == ns->mnt_ns ? 0 : -EACCES;
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_update_h_iattr(struct path *h_path, int *did)
+{
+	int err;
+	struct kstat st;
+	struct super_block *h_sb;
+
+	/* for remote fs, leave work for its getattr or d_revalidate */
+	/* for bad i_attr fs, handle them in aufs_getattr() */
+	/* still some fs may acquire i_mutex. we need to skip them */
+	err = 0;
+	if (!did)
+		did = &err;
+	h_sb = h_path->dentry->d_sb;
+	*did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb));
+	if (*did)
+		err = vfs_getattr(h_path, &st);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct file *vfsub_dentry_open(struct path *path, int flags)
+{
+	struct file *file;
+
+	file = dentry_open(path, flags /* | __FMODE_NONOTIFY */,
+			   current_cred());
+	if (!IS_ERR_OR_NULL(file)
+	    && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(d_inode(path->dentry));
+
+	return file;
+}
+
+struct file *vfsub_filp_open(const char *path, int oflags, int mode)
+{
+	struct file *file;
+
+	lockdep_off();
+	file = filp_open(path,
+			 oflags /* | __FMODE_NONOTIFY */,
+			 mode);
+	lockdep_on();
+	if (IS_ERR(file))
+		goto out;
+	vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+
+out:
+	return file;
+}
+
+/*
+ * Ideally this function should call VFS:do_last() in order to keep all of its
+ * checks. But it is very hard for aufs to regenerate several VFS internal
+ * structures such as nameidata. This is the second (or third) best approach.
+ * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open().
+ */
+int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
+		      struct vfsub_aopen_args *args, struct au_branch *br)
+{
+	int err;
+	struct file *file = args->file;
+	/* copied from linux/fs/namei.c:atomic_open() */
+	struct dentry *const DENTRY_NOT_SET = (void *)-1UL;
+
+	IMustLock(dir);
+	AuDebugOn(!dir->i_op->atomic_open);
+
+	err = au_br_test_oflag(args->open_flag, br);
+	if (unlikely(err))
+		goto out;
+
+	args->file->f_path.dentry = DENTRY_NOT_SET;
+	args->file->f_path.mnt = au_br_mnt(br);
+	err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag,
+				     args->create_mode, args->opened);
+	if (err >= 0) {
+		/* some filesystems don't set FILE_CREATED even when they succeed? */
+		if (*args->opened & FILE_CREATED)
+			fsnotify_create(dir, dentry);
+	} else
+		goto out;
+
+
+	if (!err) {
+		/* todo: call VFS:may_open() here */
+		err = open_check_o_direct(file);
+		/* todo: ima_file_check() too? */
+		if (!err && (args->open_flag & __FMODE_EXEC))
+			err = deny_write_access(file);
+		if (unlikely(err))
+			/* note that the file is created and still opened */
+			goto out;
+	}
+
+	atomic_inc(&br->br_count);
+	fsnotify_open(file);
+
+out:
+	return err;
+}
+
+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	int err;
+
+	err = kern_path(name, flags, path);
+	if (!err && d_is_positive(path->dentry))
+		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+struct dentry *vfsub_lookup_one_len_unlocked(const char *name,
+					     struct dentry *parent, int len)
+{
+	struct path path = {
+		.mnt = NULL
+	};
+
+	path.dentry = lookup_one_len_unlocked(name, parent, len);
+	if (IS_ERR(path.dentry))
+		goto out;
+	if (d_is_positive(path.dentry))
+		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
+
+out:
+	AuTraceErrPtr(path.dentry);
+	return path.dentry;
+}
+
+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
+				    int len)
+{
+	struct path path = {
+		.mnt = NULL
+	};
+
+	/* VFS checks it too, but by WARN_ON_ONCE() */
+	IMustLock(d_inode(parent));
+
+	path.dentry = lookup_one_len(name, parent, len);
+	if (IS_ERR(path.dentry))
+		goto out;
+	if (d_is_positive(path.dentry))
+		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
+
+out:
+	AuTraceErrPtr(path.dentry);
+	return path.dentry;
+}
+
+void vfsub_call_lkup_one(void *args)
+{
+	struct vfsub_lkup_one_args *a = args;
+	*a->errp = vfsub_lkup_one(a->name, a->parent);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
+				 struct dentry *d2, struct au_hinode *hdir2)
+{
+	struct dentry *d;
+
+	lockdep_off();
+	d = lock_rename(d1, d2);
+	lockdep_on();
+	au_hn_suspend(hdir1);
+	if (hdir1 != hdir2)
+		au_hn_suspend(hdir2);
+
+	return d;
+}
+
+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
+			 struct dentry *d2, struct au_hinode *hdir2)
+{
+	au_hn_resume(hdir1);
+	if (hdir1 != hdir2)
+		au_hn_resume(hdir2);
+	lockdep_off();
+	unlock_rename(d1, d2);
+	lockdep_on();
+}
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mknod(path, d, mode, 0);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_create(dir, path->dentry, mode, want_excl);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_symlink(struct inode *dir, struct path *path, const char *symname)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_symlink(path, d, symname);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_symlink(dir, path->dentry, symname);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mknod(path, d, mode, new_encode_dev(dev));
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_mknod(dir, path->dentry, mode, dev);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+static int au_test_nlink(struct inode *inode)
+{
+	const unsigned int link_max = UINT_MAX >> 1; /* rough margin */
+
+	if (!au_test_fs_no_limit_nlink(inode->i_sb)
+	    || inode->i_nlink < link_max)
+		return 0;
+	return -EMLINK;
+}
+
+int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path,
+	       struct inode **delegated_inode)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	err = au_test_nlink(d_inode(src_dentry));
+	if (unlikely(err))
+		return err;
+
+	/* we don't call may_linkat() */
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_link(src_dentry, path, d);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_link(src_dentry, dir, path->dentry, delegated_inode);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		/* fuse has different memory inode for the same inumber */
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+			tmp.dentry = src_dentry;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry,
+		 struct inode *dir, struct path *path,
+		 struct inode **delegated_inode)
+{
+	int err;
+	struct path tmp = {
+		.mnt	= path->mnt
+	};
+	struct dentry *d;
+
+	IMustLock(dir);
+	IMustLock(src_dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	tmp.dentry = src_dentry->d_parent;
+	err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_rename(src_dir, src_dentry, dir, path->dentry,
+			 delegated_inode, /*flags*/0);
+	lockdep_on();
+	if (!err) {
+		int did;
+
+		tmp.dentry = d->d_parent;
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = src_dentry;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+			tmp.dentry = src_dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_mkdir(struct inode *dir, struct path *path, int mode)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mkdir(path, d, mode);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_mkdir(dir, path->dentry, mode);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_rmdir(struct inode *dir, struct path *path)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_rmdir(path, d);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_rmdir(dir, path->dentry);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = {
+			.dentry	= path->dentry->d_parent,
+			.mnt	= path->mnt
+		};
+
+		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: support mmap_sem? */
+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
+		     loff_t *ppos)
+{
+	ssize_t err;
+
+	lockdep_off();
+	err = vfs_read(file, ubuf, count, ppos);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+/* todo: kernel_read()? */
+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
+		     loff_t *ppos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = vfsub_read_u(file, buf.u, count, ppos);
+	set_fs(oldfs);
+	return err;
+}
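+
+/*
+ * A note on the helper above: the get_fs()/set_fs(KERNEL_DS) pair
+ * temporarily widens the address-limit check so that a kernel buffer can be
+ * passed down the __user-typed vfs_read() path; the union only exists to
+ * silence the __user annotation.  vfsub_write_k() below uses the same trick.
+ */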
+
+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
+		      loff_t *ppos)
+{
+	ssize_t err;
+
+	lockdep_off();
+	err = vfs_write(file, ubuf, count, ppos);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		const char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = vfsub_write_u(file, buf.u, count, ppos);
+	set_fs(oldfs);
+	return err;
+}
+
+int vfsub_flush(struct file *file, fl_owner_t id)
+{
+	int err;
+
+	err = 0;
+	if (file->f_op->flush) {
+		if (!au_test_nfs(file->f_path.dentry->d_sb))
+			err = file->f_op->flush(file, id);
+		else {
+			lockdep_off();
+			err = file->f_op->flush(file, id);
+			lockdep_on();
+		}
+		if (!err)
+			vfsub_update_h_iattr(&file->f_path, /*did*/NULL);
+		/*ignore*/
+	}
+	return err;
+}
+
+int vfsub_iterate_dir(struct file *file, struct dir_context *ctx)
+{
+	int err;
+
+	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
+
+	lockdep_off();
+	err = iterate_dir(file, ctx);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+long vfsub_splice_to(struct file *in, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t len,
+		     unsigned int flags)
+{
+	long err;
+
+	lockdep_off();
+	err = do_splice_to(in, ppos, pipe, len, flags);
+	lockdep_on();
+	file_accessed(in);
+	if (err >= 0)
+		vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		       loff_t *ppos, size_t len, unsigned int flags)
+{
+	long err;
+
+	lockdep_off();
+	err = do_splice_from(pipe, out, ppos, len, flags);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+int vfsub_fsync(struct file *file, struct path *path, int datasync)
+{
+	int err;
+
+	/* file can be NULL */
+	lockdep_off();
+	err = vfs_fsync(file, datasync);
+	lockdep_on();
+	if (!err) {
+		if (!path) {
+			AuDebugOn(!file);
+			path = &file->f_path;
+		}
+		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
+	}
+	return err;
+}
+
+/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */
+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
+		struct file *h_file)
+{
+	int err;
+	struct inode *h_inode;
+	struct super_block *h_sb;
+
+	if (!h_file) {
+		err = vfsub_truncate(h_path, length);
+		goto out;
+	}
+
+	h_inode = d_inode(h_path->dentry);
+	h_sb = h_inode->i_sb;
+	lockdep_off();
+	sb_start_write(h_sb);
+	lockdep_on();
+	err = locks_verify_truncate(h_inode, h_file, length);
+	if (!err)
+		err = security_path_truncate(h_path);
+	if (!err) {
+		lockdep_off();
+		err = do_truncate(h_path->dentry, length, attr, h_file);
+		lockdep_on();
+	}
+	lockdep_off();
+	sb_end_write(h_sb);
+	lockdep_on();
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_vfsub_mkdir_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+	int mode;
+};
+
+static void au_call_vfsub_mkdir(void *args)
+{
+	struct au_vfsub_mkdir_args *a = args;
+	*a->errp = vfsub_mkdir(a->dir, a->path, a->mode);
+}
+
+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode)
+{
+	int err, do_sio, wkq_err;
+
+	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
+	if (!do_sio) {
+		lockdep_off();
+		err = vfsub_mkdir(dir, path, mode);
+		lockdep_on();
+	} else {
+		struct au_vfsub_mkdir_args args = {
+			.errp	= &err,
+			.dir	= dir,
+			.path	= path,
+			.mode	= mode
+		};
+		wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
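+
+/*
+ * "sio" stands for super-io: when au_test_h_perm_sio() reports that the
+ * current credentials may not operate on the lower dir, the request is
+ * handed to the aufs workqueue via au_wkq_wait(), which performs it in
+ * kernel-thread context and waits for completion.
+ */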
+
+struct au_vfsub_rmdir_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+};
+
+static void au_call_vfsub_rmdir(void *args)
+{
+	struct au_vfsub_rmdir_args *a = args;
+	*a->errp = vfsub_rmdir(a->dir, a->path);
+}
+
+int vfsub_sio_rmdir(struct inode *dir, struct path *path)
+{
+	int err, do_sio, wkq_err;
+
+	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
+	if (!do_sio) {
+		lockdep_off();
+		err = vfsub_rmdir(dir, path);
+		lockdep_on();
+	} else {
+		struct au_vfsub_rmdir_args args = {
+			.errp	= &err,
+			.dir	= dir,
+			.path	= path
+		};
+		wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct notify_change_args {
+	int *errp;
+	struct path *path;
+	struct iattr *ia;
+	struct inode **delegated_inode;
+};
+
+static void call_notify_change(void *args)
+{
+	struct notify_change_args *a = args;
+	struct inode *h_inode;
+
+	h_inode = d_inode(a->path->dentry);
+	IMustLock(h_inode);
+
+	*a->errp = -EPERM;
+	if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) {
+		lockdep_off();
+		*a->errp = notify_change(a->path->dentry, a->ia,
+					 a->delegated_inode);
+		lockdep_on();
+		if (!*a->errp)
+			vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/
+	}
+	AuTraceErr(*a->errp);
+}
+
+int vfsub_notify_change(struct path *path, struct iattr *ia,
+			struct inode **delegated_inode)
+{
+	int err;
+	struct notify_change_args args = {
+		.errp			= &err,
+		.path			= path,
+		.ia			= ia,
+		.delegated_inode	= delegated_inode
+	};
+
+	call_notify_change(&args);
+
+	return err;
+}
+
+int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
+			    struct inode **delegated_inode)
+{
+	int err, wkq_err;
+	struct notify_change_args args = {
+		.errp			= &err,
+		.path			= path,
+		.ia			= ia,
+		.delegated_inode	= delegated_inode
+	};
+
+	wkq_err = au_wkq_wait(call_notify_change, &args);
+	if (unlikely(wkq_err))
+		err = wkq_err;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct unlink_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+	struct inode **delegated_inode;
+};
+
+static void call_unlink(void *args)
+{
+	struct unlink_args *a = args;
+	struct dentry *d = a->path->dentry;
+	struct inode *h_inode;
+	const int stop_sillyrename = (au_test_nfs(d->d_sb)
+				      && au_dcount(d) == 1);
+
+	IMustLock(a->dir);
+
+	a->path->dentry = d->d_parent;
+	*a->errp = security_path_unlink(a->path, d);
+	a->path->dentry = d;
+	if (unlikely(*a->errp))
+		return;
+
+	if (!stop_sillyrename)
+		dget(d);
+	h_inode = NULL;
+	if (d_is_positive(d)) {
+		h_inode = d_inode(d);
+		ihold(h_inode);
+	}
+
+	lockdep_off();
+	*a->errp = vfs_unlink(a->dir, d, a->delegated_inode);
+	lockdep_on();
+	if (!*a->errp) {
+		struct path tmp = {
+			.dentry = d->d_parent,
+			.mnt	= a->path->mnt
+		};
+		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
+	}
+
+	if (!stop_sillyrename)
+		dput(d);
+	if (h_inode)
+		iput(h_inode);
+
+	AuTraceErr(*a->errp);
+}
+
+/*
+ * @dir: must be locked.
+ * @path: target path; path->dentry is the dentry to be unlinked.
+ */
+int vfsub_unlink(struct inode *dir, struct path *path,
+		 struct inode **delegated_inode, int force)
+{
+	int err;
+	struct unlink_args args = {
+		.errp			= &err,
+		.dir			= dir,
+		.path			= path,
+		.delegated_inode	= delegated_inode
+	};
+
+	if (!force)
+		call_unlink(&args);
+	else {
+		int wkq_err;
+
+		wkq_err = au_wkq_wait(call_unlink, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
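+
+/*
+ * Typical caller pattern (a sketch, mirroring do_unlink_wh() in
+ * fs/aufs/whout.c):
+ *
+ *	delegated = NULL;
+ *	err = vfsub_unlink(h_dir, &h_path, &delegated, force);
+ *	if (unlikely(err == -EWOULDBLOCK)) {
+ *		pr_warn("cannot retry for NFSv4 delegation"
+ *			" for an internal unlink\n");
+ *		iput(delegated);
+ *	}
+ */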
diff --git b/fs/aufs/vfsub.h b/fs/aufs/vfsub.h
new file mode 100644
index 0000000..4e08d33
--- /dev/null
+++ b/fs/aufs/vfsub.h
@@ -0,0 +1,297 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for VFS
+ */
+
+#ifndef __AUFS_VFSUB_H__
+#define __AUFS_VFSUB_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+#include <linux/xattr.h>
+#include "debug.h"
+
+/* copied from linux/fs/internal.h */
+/* todo: BAD approach!! */
+extern void __mnt_drop_write(struct vfsmount *);
+extern int open_check_o_direct(struct file *f);
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for lower inode */
+/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */
+/* reduce? gave up. */
+enum {
+	AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */
+	AuLsc_I_PARENT,		/* lower inode, parent first */
+	AuLsc_I_PARENT2,	/* copyup dirs */
+	AuLsc_I_PARENT3,	/* copyup wh */
+	AuLsc_I_CHILD,
+	AuLsc_I_CHILD2,
+	AuLsc_I_End
+};
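+
+/*
+ * An illustrative nesting (not part of the original code): the subclasses
+ * are passed to the *_lock_nested() helpers so that lockdep can tell the
+ * lower parent and child apart, e.g.
+ *
+ *	inode_lock_nested(h_parent_inode, AuLsc_I_PARENT);
+ *	inode_lock_nested(h_child_inode, AuLsc_I_CHILD);
+ *	...
+ *	inode_unlock(h_child_inode);
+ *	inode_unlock(h_parent_inode);
+ */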
+
+/* to make debugging easier, do not turn these into inline functions */
+#define MtxMustLock(mtx)	AuDebugOn(!mutex_is_locked(mtx))
+#define IMustLock(i)		AuDebugOn(!inode_is_locked(i))
+
+/* ---------------------------------------------------------------------- */
+
+static inline void vfsub_drop_nlink(struct inode *inode)
+{
+	AuDebugOn(!inode->i_nlink);
+	drop_nlink(inode);
+}
+
+static inline void vfsub_dead_dir(struct inode *inode)
+{
+	AuDebugOn(!S_ISDIR(inode->i_mode));
+	inode->i_flags |= S_DEAD;
+	clear_nlink(inode);
+}
+
+static inline int vfsub_native_ro(struct inode *inode)
+{
+	return (inode->i_sb->s_flags & MS_RDONLY)
+		|| IS_RDONLY(inode)
+		/* || IS_APPEND(inode) */
+		|| IS_IMMUTABLE(inode);
+}
+
+#ifdef CONFIG_AUFS_BR_FUSE
+int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb);
+#else
+AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_update_h_iattr(struct path *h_path, int *did);
+struct file *vfsub_dentry_open(struct path *path, int flags);
+struct file *vfsub_filp_open(const char *path, int oflags, int mode);
+struct vfsub_aopen_args {
+	struct file	*file;
+	unsigned int	open_flag;
+	umode_t		create_mode;
+	int		*opened;
+};
+struct au_branch;
+int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
+		      struct vfsub_aopen_args *args, struct au_branch *br);
+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path);
+
+struct dentry *vfsub_lookup_one_len_unlocked(const char *name,
+					     struct dentry *parent, int len);
+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
+				    int len);
+
+struct vfsub_lkup_one_args {
+	struct dentry **errp;
+	struct qstr *name;
+	struct dentry *parent;
+};
+
+static inline struct dentry *vfsub_lkup_one(struct qstr *name,
+					    struct dentry *parent)
+{
+	return vfsub_lookup_one_len(name->name, parent, name->len);
+}
+
+void vfsub_call_lkup_one(void *args);
+
+/* ---------------------------------------------------------------------- */
+
+static inline int vfsub_mnt_want_write(struct vfsmount *mnt)
+{
+	int err;
+
+	lockdep_off();
+	err = mnt_want_write(mnt);
+	lockdep_on();
+	return err;
+}
+
+static inline void vfsub_mnt_drop_write(struct vfsmount *mnt)
+{
+	lockdep_off();
+	mnt_drop_write(mnt);
+	lockdep_on();
+}
+
+#if 0 /* reserved */
+static inline void vfsub_mnt_drop_write_file(struct file *file)
+{
+	lockdep_off();
+	mnt_drop_write_file(file);
+	lockdep_on();
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+struct au_hinode;
+struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
+				 struct dentry *d2, struct au_hinode *hdir2);
+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
+			 struct dentry *d2, struct au_hinode *hdir2);
+
+int vfsub_create(struct inode *dir, struct path *path, int mode,
+		 bool want_excl);
+int vfsub_symlink(struct inode *dir, struct path *path,
+		  const char *symname);
+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev);
+int vfsub_link(struct dentry *src_dentry, struct inode *dir,
+	       struct path *path, struct inode **delegated_inode);
+int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry,
+		 struct inode *hdir, struct path *path,
+		 struct inode **delegated_inode);
+int vfsub_mkdir(struct inode *dir, struct path *path, int mode);
+int vfsub_rmdir(struct inode *dir, struct path *path);
+
+/* ---------------------------------------------------------------------- */
+
+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
+		     loff_t *ppos);
+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
+			loff_t *ppos);
+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
+		      loff_t *ppos);
+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count,
+		      loff_t *ppos);
+int vfsub_flush(struct file *file, fl_owner_t id);
+int vfsub_iterate_dir(struct file *file, struct dir_context *ctx);
+
+static inline loff_t vfsub_f_size_read(struct file *file)
+{
+	return i_size_read(file_inode(file));
+}
+
+static inline unsigned int vfsub_file_flags(struct file *file)
+{
+	unsigned int flags;
+
+	spin_lock(&file->f_lock);
+	flags = file->f_flags;
+	spin_unlock(&file->f_lock);
+
+	return flags;
+}
+
+#if 0 /* reserved */
+static inline void vfsub_file_accessed(struct file *h_file)
+{
+	file_accessed(h_file);
+	vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/
+}
+#endif
+
+#if 0 /* reserved */
+static inline void vfsub_touch_atime(struct vfsmount *h_mnt,
+				     struct dentry *h_dentry)
+{
+	struct path h_path = {
+		.dentry	= h_dentry,
+		.mnt	= h_mnt
+	};
+	touch_atime(&h_path);
+	vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/
+}
+#endif
+
+static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts,
+				    int flags)
+{
+	return generic_update_time(h_inode, ts, flags);
+	/* no vfsub_update_h_iattr() since we don't have struct path */
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+static inline int vfsub_acl_chmod(struct inode *h_inode, umode_t h_mode)
+{
+	int err;
+
+	err = posix_acl_chmod(h_inode, h_mode);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	return err;
+}
+#else
+AuStubInt0(vfsub_acl_chmod, struct inode *h_inode, umode_t h_mode);
+#endif
+
+long vfsub_splice_to(struct file *in, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t len,
+		     unsigned int flags);
+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		       loff_t *ppos, size_t len, unsigned int flags);
+
+static inline long vfsub_truncate(struct path *path, loff_t length)
+{
+	long err;
+
+	lockdep_off();
+	err = vfs_truncate(path, length);
+	lockdep_on();
+	return err;
+}
+
+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
+		struct file *h_file);
+int vfsub_fsync(struct file *file, struct path *path, int datasync);
+
+/* ---------------------------------------------------------------------- */
+
+static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t err;
+
+	lockdep_off();
+	err = vfs_llseek(file, offset, origin);
+	lockdep_on();
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode);
+int vfsub_sio_rmdir(struct inode *dir, struct path *path);
+int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
+			    struct inode **delegated_inode);
+int vfsub_notify_change(struct path *path, struct iattr *ia,
+			struct inode **delegated_inode);
+int vfsub_unlink(struct inode *dir, struct path *path,
+		 struct inode **delegated_inode, int force);
+
+/* ---------------------------------------------------------------------- */
+
+static inline int vfsub_setxattr(struct dentry *dentry, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	int err;
+
+	lockdep_off();
+	err = vfs_setxattr(dentry, name, value, size, flags);
+	lockdep_on();
+
+	return err;
+}
+
+static inline int vfsub_removexattr(struct dentry *dentry, const char *name)
+{
+	int err;
+
+	lockdep_off();
+	err = vfs_removexattr(dentry, name);
+	lockdep_on();
+
+	return err;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_VFSUB_H__ */
diff --git b/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c
new file mode 100644
index 0000000..9943163
--- /dev/null
+++ b/fs/aufs/wbr_policy.c
@@ -0,0 +1,752 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * policies for selecting one among multiple writable branches
+ */
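+
+/*
+ * Roughly: "tdp" follows the top-down parent, "rr" round-robins over the
+ * writable branches, "mfs" picks the branch with the most free space, and
+ * the pmfs/mfsrr/pmfsrr variants combine these; see au_wbr_create_ops[]
+ * and au_wbr_copyup_ops[] at the bottom of this file.
+ */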
+
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/* subset of cpup_attr() */
+static noinline_for_stack
+int au_cpdown_attr(struct path *h_path, struct dentry *h_src)
+{
+	int err, sbits;
+	struct iattr ia;
+	struct inode *h_isrc;
+
+	h_isrc = d_inode(h_src);
+	ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID;
+	ia.ia_mode = h_isrc->i_mode;
+	ia.ia_uid = h_isrc->i_uid;
+	ia.ia_gid = h_isrc->i_gid;
+	sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID));
+	au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags);
+	/* no delegation since it is just created */
+	err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
+
+	/* is this nfs only? */
+	if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) {
+		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
+		ia.ia_mode = h_isrc->i_mode;
+		err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
+	}
+
+	return err;
+}
+
+#define AuCpdown_PARENT_OPQ	1
+#define AuCpdown_WHED		(1 << 1)
+#define AuCpdown_MADE_DIR	(1 << 2)
+#define AuCpdown_DIROPQ		(1 << 3)
+#define au_ftest_cpdown(flags, name)	((flags) & AuCpdown_##name)
+#define au_fset_cpdown(flags, name) \
+	do { (flags) |= AuCpdown_##name; } while (0)
+#define au_fclr_cpdown(flags, name) \
+	do { (flags) &= ~AuCpdown_##name; } while (0)
+
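+/*
+ * Minimal usage sketch of the flag helpers above (illustrative):
+ *
+ *	unsigned int flags = 0;
+ *	au_fset_cpdown(flags, MADE_DIR);	(sets the bit)
+ *	au_ftest_cpdown(flags, MADE_DIR)	(now non-zero)
+ *	au_fclr_cpdown(flags, MADE_DIR);	(flags is zero again)
+ */
+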
+static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst,
+			     unsigned int *flags)
+{
+	int err;
+	struct dentry *opq_dentry;
+
+	opq_dentry = au_diropq_create(dentry, bdst);
+	err = PTR_ERR(opq_dentry);
+	if (IS_ERR(opq_dentry))
+		goto out;
+	dput(opq_dentry);
+	au_fset_cpdown(*flags, DIROPQ);
+
+out:
+	return err;
+}
+
+static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent,
+			    struct inode *dir, aufs_bindex_t bdst)
+{
+	int err;
+	struct path h_path;
+	struct au_branch *br;
+
+	br = au_sbr(dentry->d_sb, bdst);
+	h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	err = 0;
+	if (d_is_positive(h_path.dentry)) {
+		h_path.mnt = au_br_mnt(br);
+		err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path,
+					  dentry);
+	}
+	dput(h_path.dentry);
+
+out:
+	return err;
+}
+
+static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg)
+{
+	int err, rerr;
+	aufs_bindex_t bopq, bstart;
+	struct path h_path;
+	struct dentry *parent;
+	struct inode *h_dir, *h_inode, *inode, *dir;
+	unsigned int *flags = arg;
+
+	bstart = au_dbstart(dentry);
+	/* dentry is di-locked */
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	h_dir = d_inode(h_parent);
+	AuDebugOn(h_dir != au_h_iptr(dir, bdst));
+	IMustLock(h_dir);
+
+	err = au_lkup_neg(dentry, bdst, /*wh*/0);
+	if (unlikely(err < 0))
+		goto out;
+	h_path.dentry = au_h_dptr(dentry, bdst);
+	h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst);
+	err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path,
+			      S_IRWXU | S_IRUGO | S_IXUGO);
+	if (unlikely(err))
+		goto out_put;
+	au_fset_cpdown(*flags, MADE_DIR);
+
+	bopq = au_dbdiropq(dentry);
+	au_fclr_cpdown(*flags, WHED);
+	au_fclr_cpdown(*flags, DIROPQ);
+	if (au_dbwh(dentry) == bdst)
+		au_fset_cpdown(*flags, WHED);
+	if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst)
+		au_fset_cpdown(*flags, PARENT_OPQ);
+	h_inode = d_inode(h_path.dentry);
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	if (au_ftest_cpdown(*flags, WHED)) {
+		err = au_cpdown_dir_opq(dentry, bdst, flags);
+		if (unlikely(err)) {
+			inode_unlock(h_inode);
+			goto out_dir;
+		}
+	}
+
+	err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart));
+	inode_unlock(h_inode);
+	if (unlikely(err))
+		goto out_opq;
+
+	if (au_ftest_cpdown(*flags, WHED)) {
+		err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst);
+		if (unlikely(err))
+			goto out_opq;
+	}
+
+	inode = d_inode(dentry);
+	if (au_ibend(inode) < bdst)
+		au_set_ibend(inode, bdst);
+	au_set_h_iptr(inode, bdst, au_igrab(h_inode),
+		      au_hi_flags(inode, /*isdir*/1));
+	au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0);
+	goto out; /* success */
+
+	/* revert */
+out_opq:
+	if (au_ftest_cpdown(*flags, DIROPQ)) {
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		rerr = au_diropq_remove(dentry, bdst);
+		inode_unlock(h_inode);
+		if (unlikely(rerr)) {
+			AuIOErr("failed removing diropq for %pd b%d (%d)\n",
+				dentry, bdst, rerr);
+			err = -EIO;
+			goto out;
+		}
+	}
+out_dir:
+	if (au_ftest_cpdown(*flags, MADE_DIR)) {
+		rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path);
+		if (unlikely(rerr)) {
+			AuIOErr("failed removing %pd b%d (%d)\n",
+				dentry, bdst, rerr);
+			err = -EIO;
+		}
+	}
+out_put:
+	au_set_h_dptr(dentry, bdst, NULL);
+	if (au_dbend(dentry) == bdst)
+		au_update_dbend(dentry);
+out:
+	dput(parent);
+	return err;
+}
+
+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	int err;
+	unsigned int flags;
+
+	flags = 0;
+	err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies for create */
+
+int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	int err, i, j, ndentry;
+	aufs_bindex_t bopq;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries, *parent, *d;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	parent = dget_parent(dentry);
+	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0);
+	if (unlikely(err))
+		goto out_free;
+
+	err = bindex;
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			d = dentries[j];
+			di_read_lock_parent2(d, !AuLock_IR);
+			bopq = au_dbdiropq(d);
+			di_read_unlock(d, !AuLock_IR);
+			if (bopq >= 0 && bopq < err)
+				err = bopq;
+		}
+	}
+
+out_free:
+	dput(parent);
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex)
+{
+	for (; bindex >= 0; bindex--)
+		if (!au_br_rdonly(au_sbr(sb, bindex)))
+			return bindex;
+	return -EROFS;
+}
+
+/* top down parent */
+static int au_wbr_create_tdp(struct dentry *dentry,
+			     unsigned int flags __maybe_unused)
+{
+	int err;
+	aufs_bindex_t bstart, bindex;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent;
+
+	sb = dentry->d_sb;
+	bstart = au_dbstart(dentry);
+	err = bstart;
+	if (!au_br_rdonly(au_sbr(sb, bstart)))
+		goto out;
+
+	err = -EROFS;
+	parent = dget_parent(dentry);
+	for (bindex = au_dbstart(parent); bindex < bstart; bindex++) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		if (!au_br_rdonly(au_sbr(sb, bindex))) {
+			err = bindex;
+			break;
+		}
+	}
+	dput(parent);
+
+	/* bottom up here */
+	if (unlikely(err < 0)) {
+		err = au_wbr_bu(sb, bstart - 1);
+		if (err >= 0)
+			err = au_wbr_nonopq(dentry, err);
+	}
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* an exception for policies other than tdp */
+static int au_wbr_create_exp(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bwh, bdiropq;
+	struct dentry *parent;
+
+	err = -1;
+	bwh = au_dbwh(dentry);
+	parent = dget_parent(dentry);
+	bdiropq = au_dbdiropq(parent);
+	if (bwh >= 0) {
+		if (bdiropq >= 0)
+			err = min(bdiropq, bwh);
+		else
+			err = bwh;
+		AuDbg("%d\n", err);
+	} else if (bdiropq >= 0) {
+		err = bdiropq;
+		AuDbg("%d\n", err);
+	}
+	dput(parent);
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+	if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err)))
+		err = -1;
+
+	AuDbg("%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* round robin */
+static int au_wbr_create_init_rr(struct super_block *sb)
+{
+	int err;
+
+	err = au_wbr_bu(sb, au_sbend(sb));
+	atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */
+	/* smp_mb(); */
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags)
+{
+	int err, nbr;
+	unsigned int u;
+	aufs_bindex_t bindex, bend;
+	struct super_block *sb;
+	atomic_t *next;
+
+	err = au_wbr_create_exp(dentry);
+	if (err >= 0)
+		goto out;
+
+	sb = dentry->d_sb;
+	next = &au_sbi(sb)->si_wbr_rr_next;
+	bend = au_sbend(sb);
+	nbr = bend + 1;
+	for (bindex = 0; bindex <= bend; bindex++) {
+		if (!au_ftest_wbr(flags, DIR)) {
+			err = atomic_dec_return(next) + 1;
+			/* modulo for 0 is meaningless */
+			if (unlikely(!err))
+				err = atomic_dec_return(next) + 1;
+		} else
+			err = atomic_read(next);
+		AuDbg("%d\n", err);
+		u = err;
+		err = u % nbr;
+		AuDbg("%d\n", err);
+		if (!au_br_rdonly(au_sbr(sb, err)))
+			break;
+		err = -EROFS;
+	}
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out:
+	AuDbg("%d\n", err);
+	return err;
+}
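+
+/*
+ * A worked example of the selection above (illustrative numbers): with
+ * three branches (nbr == 3) and a counter value of 7, 7 % 3 == 1 makes b1
+ * the candidate; the loop runs at most nbr times, skipping read-only
+ * candidates, and returns -EROFS when no writable branch is found.
+ */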
+
+/* ---------------------------------------------------------------------- */
+
+/* most free space */
+static void au_mfs(struct dentry *dentry, struct dentry *parent)
+{
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_wbr_mfs *mfs;
+	struct dentry *h_parent;
+	aufs_bindex_t bindex, bend;
+	int err;
+	unsigned long long b, bavail;
+	struct path h_path;
+	/* reduce the stack usage */
+	struct kstatfs *st;
+
+	st = kmalloc(sizeof(*st), GFP_NOFS);
+	if (unlikely(!st)) {
+		AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM);
+		return;
+	}
+
+	bavail = 0;
+	sb = dentry->d_sb;
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	MtxMustLock(&mfs->mfs_lock);
+	mfs->mfs_bindex = -EROFS;
+	mfs->mfsrr_bytes = 0;
+	if (!parent) {
+		bindex = 0;
+		bend = au_sbend(sb);
+	} else {
+		bindex = au_dbstart(parent);
+		bend = au_dbtaildir(parent);
+	}
+
+	for (; bindex <= bend; bindex++) {
+		if (parent) {
+			h_parent = au_h_dptr(parent, bindex);
+			if (!h_parent || d_is_negative(h_parent))
+				continue;
+		}
+		br = au_sbr(sb, bindex);
+		if (au_br_rdonly(br))
+			continue;
+
+		/* sb->s_root for NFS is unreliable */
+		h_path.mnt = au_br_mnt(br);
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, st);
+		if (unlikely(err)) {
+			AuWarn1("failed statfs, b%d, %d\n", bindex, err);
+			continue;
+		}
+
+		/* when the available size is equal, select the lower one */
+		BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail)
+			     || sizeof(b) < sizeof(st->f_bsize));
+		b = st->f_bavail * st->f_bsize;
+		br->br_wbr->wbr_bytes = b;
+		if (b >= bavail) {
+			bavail = b;
+			mfs->mfs_bindex = bindex;
+			mfs->mfs_jiffy = jiffies;
+		}
+	}
+
+	mfs->mfsrr_bytes = bavail;
+	AuDbg("b%d\n", mfs->mfs_bindex);
+	kfree(st);
+}
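+
+/*
+ * Example of the byte calculation above (illustrative numbers): with
+ * st->f_bsize == 4096 and st->f_bavail == 1000000, the branch offers
+ * b == 4096000000 bytes; the branch with the largest b wins mfs_bindex.
+ */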
+
+static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	struct dentry *parent;
+	struct super_block *sb;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_exp(dentry);
+	if (err >= 0)
+		goto out;
+
+	sb = dentry->d_sb;
+	parent = NULL;
+	if (au_ftest_wbr(flags, PARENT))
+		parent = dget_parent(dentry);
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_lock(&mfs->mfs_lock);
+	if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire)
+	    || mfs->mfs_bindex < 0
+	    || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex)))
+		au_mfs(dentry, parent);
+	mutex_unlock(&mfs->mfs_lock);
+	err = mfs->mfs_bindex;
+	dput(parent);
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_init_mfs(struct super_block *sb)
+{
+	struct au_wbr_mfs *mfs;
+
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_init(&mfs->mfs_lock);
+	mfs->mfs_jiffy = 0;
+	mfs->mfs_bindex = -EROFS;
+
+	return 0;
+}
+
+static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused)
+{
+	mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* most free space and then round robin */
+static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_mfs(dentry, flags);
+	if (err >= 0) {
+		mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs;
+		mutex_lock(&mfs->mfs_lock);
+		if (mfs->mfsrr_bytes < mfs->mfsrr_watermark)
+			err = au_wbr_create_rr(dentry, flags);
+		mutex_unlock(&mfs->mfs_lock);
+	}
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_init_mfsrr(struct super_block *sb)
+{
+	int err;
+
+	au_wbr_create_init_mfs(sb); /* ignore */
+	err = au_wbr_create_init_rr(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* top down parent and most free space */
+static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags)
+{
+	int err, e2;
+	unsigned long long b;
+	aufs_bindex_t bindex, bstart, bend;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent;
+	struct au_branch *br;
+
+	err = au_wbr_create_tdp(dentry, flags);
+	if (unlikely(err < 0))
+		goto out;
+	parent = dget_parent(dentry);
+	bstart = au_dbstart(parent);
+	bend = au_dbtaildir(parent);
+	if (bstart == bend)
+		goto out_parent; /* success */
+
+	e2 = au_wbr_create_mfs(dentry, flags);
+	if (e2 < 0)
+		goto out_parent; /* success */
+
+	/* when the available size is equal, select upper one */
+	sb = dentry->d_sb;
+	br = au_sbr(sb, err);
+	b = br->br_wbr->wbr_bytes;
+	AuDbg("b%d, %llu\n", err, b);
+
+	for (bindex = bstart; bindex <= bend; bindex++) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		br = au_sbr(sb, bindex);
+		if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) {
+			b = br->br_wbr->wbr_bytes;
+			err = bindex;
+			AuDbg("b%d, %llu\n", err, b);
+		}
+	}
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out_parent:
+	dput(parent);
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * - top down parent
+ * - most free space with parent
+ * - most free space with round-robin, regardless of the parent
+ */
+static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	unsigned long long watermark;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT);
+	if (unlikely(err < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, err);
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_lock(&mfs->mfs_lock);
+	watermark = mfs->mfsrr_watermark;
+	mutex_unlock(&mfs->mfs_lock);
+	if (br->br_wbr->wbr_bytes < watermark)
+		/* regardless of the parent dir */
+		err = au_wbr_create_mfsrr(dentry, flags);
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies for copyup */
+
+/* top down parent */
+static int au_wbr_copyup_tdp(struct dentry *dentry)
+{
+	return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0);
+}
+
+/* bottom up parent */
+static int au_wbr_copyup_bup(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bindex, bstart;
+	struct dentry *parent, *h_parent;
+	struct super_block *sb;
+
+	err = -EROFS;
+	sb = dentry->d_sb;
+	parent = dget_parent(dentry);
+	bstart = au_dbstart(parent);
+	for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		if (!au_br_rdonly(au_sbr(sb, bindex))) {
+			err = bindex;
+			break;
+		}
+	}
+	dput(parent);
+
+	/* bottom up here */
+	if (unlikely(err < 0))
+		err = au_wbr_bu(sb, bstart - 1);
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* bottom up */
+int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart)
+{
+	int err;
+
+	err = au_wbr_bu(dentry->d_sb, bstart);
+	AuDbg("b%d\n", err);
+	if (err > bstart)
+		err = au_wbr_nonopq(dentry, err);
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_copyup_bu(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bstart;
+
+	bstart = au_dbstart(dentry);
+	err = au_wbr_do_copyup_bu(dentry, bstart);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_wbr_copyup_operations au_wbr_copyup_ops[] = {
+	[AuWbrCopyup_TDP] = {
+		.copyup	= au_wbr_copyup_tdp
+	},
+	[AuWbrCopyup_BUP] = {
+		.copyup	= au_wbr_copyup_bup
+	},
+	[AuWbrCopyup_BU] = {
+		.copyup	= au_wbr_copyup_bu
+	}
+};
+
+struct au_wbr_create_operations au_wbr_create_ops[] = {
+	[AuWbrCreate_TDP] = {
+		.create	= au_wbr_create_tdp
+	},
+	[AuWbrCreate_RR] = {
+		.create	= au_wbr_create_rr,
+		.init	= au_wbr_create_init_rr
+	},
+	[AuWbrCreate_MFS] = {
+		.create	= au_wbr_create_mfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSV] = {
+		.create	= au_wbr_create_mfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSRR] = {
+		.create	= au_wbr_create_mfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSRRV] = {
+		.create	= au_wbr_create_mfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFS] = {
+		.create	= au_wbr_create_pmfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSV] = {
+		.create	= au_wbr_create_pmfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSRR] = {
+		.create	= au_wbr_create_pmfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSRRV] = {
+		.create	= au_wbr_create_pmfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	}
+};
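+
+/*
+ * These tables are indexed by the wbr_copyup/wbr_create mount options
+ * (e.g. "create=mfs" or "copyup=bup" in the aufs manual); the *V entries
+ * share the callbacks of their base policies and differ only in the
+ * user-supplied parameter (expiration time or watermark) parsed at mount
+ * time.
+ */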
diff --git b/fs/aufs/whout.c b/fs/aufs/whout.c
new file mode 100644
index 0000000..fdda85e
--- /dev/null
+++ b/fs/aufs/whout.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * whiteout for logical deletion and opaque directory
+ */
+
+#include "aufs.h"
+
+#define WH_MASK			S_IRUGO
+
+/*
+ * If a directory contains this file, then it is opaque.  The name carries
+ * the .wh. prefix so that it is hidden from normal lookups.
+ */
+static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ,
+					   sizeof(AUFS_WH_DIROPQ) - 1);
+
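+/*
+ * With the usual aufs macros this marker is the literal name
+ * ".wh..wh..opq" (AUFS_WH_DIROPQ); it is created and removed by
+ * do_diropq() below through link_or_create_wh() and do_unlink_wh().
+ */
+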
+/*
+ * generate the whiteout name, which is NOT NUL-terminated.
+ * @wh: whiteout qstr to be filled
+ * @name: original qstr (dentry's d_name)
+ * returns zero on success, otherwise an error.
+ * on success, wh->name must be freed by kfree().
+ */
+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name)
+{
+	char *p;
+
+	if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN))
+		return -ENAMETOOLONG;
+
+	wh->len = name->len + AUFS_WH_PFX_LEN;
+	p = kmalloc(wh->len, GFP_NOFS);
+	wh->name = p;
+	if (p) {
+		memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+		memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len);
+		/* smp_mb(); */
+		return 0;
+	}
+	return -ENOMEM;
+}
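+
+/*
+ * Example (assuming the usual AUFS_WH_PFX of ".wh."): for the name "foo"
+ * this yields the 7-byte, non-NUL-terminated string ".wh.foo" with
+ * wh->len == 7.
+ */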
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * test if the @wh_name exists under @h_parent.
+ * @try_sio specifies whether super-io is necessary.
+ */
+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio)
+{
+	int err;
+	struct dentry *wh_dentry;
+
+	if (!try_sio)
+		wh_dentry = vfsub_lkup_one(wh_name, h_parent);
+	else
+		wh_dentry = au_sio_lkup_one(wh_name, h_parent);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry)) {
+		if (err == -ENAMETOOLONG)
+			err = 0;
+		goto out;
+	}
+
+	err = 0;
+	if (d_is_negative(wh_dentry))
+		goto out_wh; /* success */
+
+	err = 1;
+	if (d_is_reg(wh_dentry))
+		goto out_wh; /* success */
+
+	err = -EIO;
+	AuIOErr("%pd Invalid whiteout entry type 0%o.\n",
+		wh_dentry, d_inode(wh_dentry)->i_mode);
+
+out_wh:
+	dput(wh_dentry);
+out:
+	return err;
+}
+
+/*
+ * test whether the dir @h_dentry is opaque, i.e. contains the diropq whiteout.
+ */
+int au_diropq_test(struct dentry *h_dentry)
+{
+	int err;
+	struct inode *h_dir;
+
+	h_dir = d_inode(h_dentry);
+	err = au_wh_test(h_dentry, &diropq_name,
+			 au_test_h_perm_sio(h_dir, MAY_EXEC));
+	return err;
+}
+
+/*
+ * returns a negative dentry whose name is unique and temporary.
+ */
+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
+			     struct qstr *prefix)
+{
+	struct dentry *dentry;
+	int i;
+	char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1],
+		*name, *p;
+	/* strict atomic_t is unnecessary here */
+	static unsigned short cnt;
+	struct qstr qs;
+
+	BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN);
+
+	name = defname;
+	qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1;
+	if (unlikely(prefix->len > DNAME_INLINE_LEN)) {
+		dentry = ERR_PTR(-ENAMETOOLONG);
+		if (unlikely(qs.len > NAME_MAX))
+			goto out;
+		dentry = ERR_PTR(-ENOMEM);
+		name = kmalloc(qs.len + 1, GFP_NOFS);
+		if (unlikely(!name))
+			goto out;
+	}
+
+	/* doubly whiteout-ed */
+	memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2);
+	p = name + AUFS_WH_PFX_LEN * 2;
+	memcpy(p, prefix->name, prefix->len);
+	p += prefix->len;
+	*p++ = '.';
+	AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN);
+
+	qs.name = name;
+	for (i = 0; i < 3; i++) {
+		sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++);
+		dentry = au_sio_lkup_one(&qs, h_parent);
+		if (IS_ERR(dentry) || d_is_negative(dentry))
+			goto out_name;
+		dput(dentry);
+	}
+	/* pr_warn("could not get random name\n"); */
+	dentry = ERR_PTR(-EEXIST);
+	AuDbg("%.*s\n", AuLNPair(&qs));
+	BUG();
+
+out_name:
+	if (name != defname)
+		kfree(name);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
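+
+/*
+ * Shape of the generated name (illustrative; the counter is printed as
+ * AUFS_WH_TMP_LEN hex digits): for the prefix "dir" it looks like
+ * ".wh..wh.dir.002a".
+ */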
+
+/*
+ * rename the @h_dentry on @br to the whiteouted temporary name.
+ */
+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+	struct inode *h_dir, *delegated;
+	struct dentry *h_parent;
+
+	h_parent = h_dentry->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	/* under the same dir, no need to lock_rename() */
+	delegated = NULL;
+	err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated);
+	AuTraceErr(err);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	dput(h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * functions for removing a whiteout
+ */
+
+static int do_unlink_wh(struct inode *h_dir, struct path *h_path)
+{
+	int err, force;
+	struct inode *delegated;
+
+	/*
+	 * forces superio when the dir has a sticky bit.
+	 * this may be a violation of unix fs semantics.
+	 */
+	force = (h_dir->i_mode & S_ISVTX)
+		&& !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid);
+	delegated = NULL;
+	err = vfsub_unlink(h_dir, h_path, &delegated, force);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	return err;
+}
+
+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
+			struct dentry *dentry)
+{
+	int err;
+
+	err = do_unlink_wh(h_dir, h_path);
+	if (!err && dentry)
+		au_set_dbwh(dentry, -1);
+
+	return err;
+}
+
+static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh,
+			  struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+
+	err = 0;
+	h_path.dentry = vfsub_lkup_one(wh, h_parent);
+	if (IS_ERR(h_path.dentry))
+		err = PTR_ERR(h_path.dentry);
+	else {
+		if (d_is_reg(h_path.dentry))
+			err = do_unlink_wh(d_inode(h_parent), &h_path);
+		dput(h_path.dentry);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * initialize/clean whiteout for a branch
+ */
+
+static void au_wh_clean(struct inode *h_dir, struct path *whpath,
+			const int isdir)
+{
+	int err;
+	struct inode *delegated;
+
+	if (d_is_negative(whpath->dentry))
+		return;
+
+	if (isdir)
+		err = vfsub_rmdir(h_dir, whpath);
+	else {
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	}
+	if (unlikely(err))
+		pr_warn("failed removing %pd (%d), ignored.\n",
+			whpath->dentry, err);
+}
+
+static int test_linkable(struct dentry *h_root)
+{
+	struct inode *h_dir = d_inode(h_root);
+
+	if (h_dir->i_op->link)
+		return 0;
+
+	pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n",
+	       h_root, au_sbtype(h_root->d_sb));
+	return -ENOSYS;
+}
+
+/* todo: should this mkdir be done in /sbin/mount.aufs helper? */
+static int au_whdir(struct inode *h_dir, struct path *path)
+{
+	int err;
+
+	err = -EEXIST;
+	if (d_is_negative(path->dentry)) {
+		int mode = S_IRWXU;
+
+		if (au_test_nfs(path->dentry->d_sb))
+			mode |= S_IXUGO;
+		err = vfsub_mkdir(h_dir, path, mode);
+	} else if (d_is_dir(path->dentry))
+		err = 0;
+	else
+		pr_err("unknown %pd exists\n", path->dentry);
+
+	return err;
+}
+
+struct au_wh_base {
+	const struct qstr *name;
+	struct dentry *dentry;
+};
+
+static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[],
+			  struct path *h_path)
+{
+	h_path->dentry = base[AuBrWh_BASE].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/0);
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/1);
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/1);
+}
+
+/*
+ * returns tri-state,
+ * minus: error, caller should print the message
+ * zero: success
+ * plus: error, caller should NOT print the message
+ */
+static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr,
+				int do_plink, struct au_wh_base base[],
+				struct path *h_path)
+{
+	int err;
+	struct inode *h_dir;
+
+	h_dir = d_inode(h_root);
+	h_path->dentry = base[AuBrWh_BASE].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/0);
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	if (do_plink) {
+		err = test_linkable(h_root);
+		if (unlikely(err)) {
+			err = 1;
+			goto out;
+		}
+
+		err = au_whdir(h_dir, h_path);
+		if (unlikely(err))
+			goto out;
+		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
+	} else
+		au_wh_clean(h_dir, h_path, /*isdir*/1);
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	err = au_whdir(h_dir, h_path);
+	if (unlikely(err))
+		goto out;
+	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
+
+out:
+	return err;
+}
+
+/*
+ * for now, aufs nominally supports a branch filesystem which does not
+ * support link(2).  but in testing on FAT, which does not fully support
+ * i_op->setattr() either, copyup failed; in the end such a filesystem will
+ * not be usable as the writable branch.
+ *
+ * returns tri-state, see above.
+ */
+static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr,
+			 int do_plink, struct au_wh_base base[],
+			 struct path *h_path)
+{
+	int err;
+	struct inode *h_dir;
+
+	WbrWhMustWriteLock(wbr);
+
+	err = test_linkable(h_root);
+	if (unlikely(err)) {
+		err = 1;
+		goto out;
+	}
+
+	/*
+	 * todo: should this create be done in /sbin/mount.aufs helper?
+	 */
+	err = -EEXIST;
+	h_dir = d_inode(h_root);
+	if (d_is_negative(base[AuBrWh_BASE].dentry)) {
+		h_path->dentry = base[AuBrWh_BASE].dentry;
+		err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true);
+	} else if (d_is_reg(base[AuBrWh_BASE].dentry))
+		err = 0;
+	else
+		pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry);
+	if (unlikely(err))
+		goto out;
+
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	if (do_plink) {
+		err = au_whdir(h_dir, h_path);
+		if (unlikely(err))
+			goto out;
+		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
+	} else
+		au_wh_clean(h_dir, h_path, /*isdir*/1);
+	wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry);
+
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	err = au_whdir(h_dir, h_path);
+	if (unlikely(err))
+		goto out;
+	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
+
+out:
+	return err;
+}
+
+/*
+ * initialize the whiteout base file/dir for @br.
+ */
+int au_wh_init(struct au_branch *br, struct super_block *sb)
+{
+	int err, i;
+	const unsigned char do_plink
+		= !!au_opt_test(au_mntflags(sb), PLINK);
+	struct inode *h_dir;
+	struct path path = br->br_path;
+	struct dentry *h_root = path.dentry;
+	struct au_wbr *wbr = br->br_wbr;
+	static const struct qstr base_name[] = {
+		[AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME,
+					  sizeof(AUFS_BASE_NAME) - 1),
+		[AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME,
+					   sizeof(AUFS_PLINKDIR_NAME) - 1),
+		[AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME,
+					  sizeof(AUFS_ORPHDIR_NAME) - 1)
+	};
+	struct au_wh_base base[] = {
+		[AuBrWh_BASE] = {
+			.name	= base_name + AuBrWh_BASE,
+			.dentry	= NULL
+		},
+		[AuBrWh_PLINK] = {
+			.name	= base_name + AuBrWh_PLINK,
+			.dentry	= NULL
+		},
+		[AuBrWh_ORPH] = {
+			.name	= base_name + AuBrWh_ORPH,
+			.dentry	= NULL
+		}
+	};
+
+	if (wbr)
+		WbrWhMustWriteLock(wbr);
+
+	for (i = 0; i < AuBrWh_Last; i++) {
+		/* doubly whiteouted */
+		struct dentry *d;
+
+		d = au_wh_lkup(h_root, (void *)base[i].name, br);
+		err = PTR_ERR(d);
+		if (IS_ERR(d))
+			goto out;
+
+		base[i].dentry = d;
+		AuDebugOn(wbr
+			  && wbr->wbr_wh[i]
+			  && wbr->wbr_wh[i] != base[i].dentry);
+	}
+
+	if (wbr)
+		for (i = 0; i < AuBrWh_Last; i++) {
+			dput(wbr->wbr_wh[i]);
+			wbr->wbr_wh[i] = NULL;
+		}
+
+	err = 0;
+	if (!au_br_writable(br->br_perm)) {
+		h_dir = d_inode(h_root);
+		au_wh_init_ro(h_dir, base, &path);
+	} else if (!au_br_wh_linkable(br->br_perm)) {
+		err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path);
+		if (err > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+	} else {
+		err = au_wh_init_rw(h_root, wbr, do_plink, base, &path);
+		if (err > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+	}
+	goto out; /* success */
+
+out_err:
+	pr_err("an error(%d) on the writable branch %pd(%s)\n",
+	       err, h_root, au_sbtype(h_root->d_sb));
+out:
+	for (i = 0; i < AuBrWh_Last; i++)
+		dput(base[i].dentry);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * usually all whiteouts are hard links to a single base file.
+ * when its link count reaches the ceiling, we create a new whiteout base
+ * asynchronously.
+ */
+
+struct reinit_br_wh {
+	struct super_block *sb;
+	struct au_branch *br;
+};
+
+static void reinit_br_wh(void *arg)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct path h_path;
+	struct reinit_br_wh *a = arg;
+	struct au_wbr *wbr;
+	struct inode *dir, *delegated;
+	struct dentry *h_root;
+	struct au_hinode *hdir;
+
+	err = 0;
+	wbr = a->br->br_wbr;
+	/* big aufs lock */
+	si_noflush_write_lock(a->sb);
+	if (!au_br_writable(a->br->br_perm))
+		goto out;
+	bindex = au_br_index(a->sb, a->br->br_id);
+	if (unlikely(bindex < 0))
+		goto out;
+
+	di_read_lock_parent(a->sb->s_root, AuLock_IR);
+	dir = d_inode(a->sb->s_root);
+	hdir = au_hi(dir, bindex);
+	h_root = au_h_dptr(a->sb->s_root, bindex);
+	AuDebugOn(h_root != au_br_dentry(a->br));
+
+	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
+	wbr_wh_write_lock(wbr);
+	err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode,
+			  h_root, a->br);
+	if (!err) {
+		h_path.dentry = wbr->wbr_whbase;
+		h_path.mnt = au_br_mnt(a->br);
+		delegated = NULL;
+		err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated,
+				   /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	} else {
+		pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase);
+		err = 0;
+	}
+	dput(wbr->wbr_whbase);
+	wbr->wbr_whbase = NULL;
+	if (!err)
+		err = au_wh_init(a->br, a->sb);
+	wbr_wh_write_unlock(wbr);
+	au_hn_imtx_unlock(hdir);
+	di_read_unlock(a->sb->s_root, AuLock_IR);
+	if (!err)
+		au_fhsm_wrote(a->sb, bindex, /*force*/0);
+
+out:
+	if (wbr)
+		atomic_dec(&wbr->wbr_wh_running);
+	atomic_dec(&a->br->br_count);
+	si_write_unlock(a->sb);
+	au_nwt_done(&au_sbi(a->sb)->si_nowait);
+	kfree(arg);
+	if (unlikely(err))
+		AuIOErr("err %d\n", err);
+}
+
+static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br)
+{
+	int do_dec, wkq_err;
+	struct reinit_br_wh *arg;
+
+	do_dec = 1;
+	if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1)
+		goto out;
+
+	/* ignore ENOMEM */
+	arg = kmalloc(sizeof(*arg), GFP_NOFS);
+	if (arg) {
+		/*
+		 * dec(wh_running), kfree(arg) and dec(br_count)
+		 * in reinit function
+		 */
+		arg->sb = sb;
+		arg->br = br;
+		atomic_inc(&br->br_count);
+		wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0);
+		if (unlikely(wkq_err)) {
+			atomic_dec(&br->br_wbr->wbr_wh_running);
+			atomic_dec(&br->br_count);
+			kfree(arg);
+		}
+		do_dec = 0;
+	}
+
+out:
+	if (do_dec)
+		atomic_dec(&br->br_wbr->wbr_wh_running);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create the whiteout @wh.
+ */
+static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex,
+			     struct dentry *wh)
+{
+	int err;
+	struct path h_path = {
+		.dentry = wh
+	};
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *h_parent;
+	struct inode *h_dir, *delegated;
+
+	h_parent = wh->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	br = au_sbr(sb, bindex);
+	h_path.mnt = au_br_mnt(br);
+	wbr = br->br_wbr;
+	wbr_wh_read_lock(wbr);
+	if (wbr->wbr_whbase) {
+		delegated = NULL;
+		err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal link\n");
+			iput(delegated);
+		}
+		if (!err || err != -EMLINK)
+			goto out;
+
+		/* link count full. re-initialize br_whbase. */
+		kick_reinit_br_wh(sb, br);
+	}
+
+	/* return this error in this context */
+	err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true);
+	if (!err)
+		au_fhsm_wrote(sb, bindex, /*force*/0);
+
+out:
+	wbr_wh_read_unlock(wbr);
+	return err;
+}
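+
+/*
+ * Flow sketch: au_wh_create() and do_diropq() funnel into
+ * link_or_create_wh(); while wbr_whbase has spare link count the whiteout
+ * is hard-linked to it, and on -EMLINK a fresh base is rebuilt
+ * asynchronously by kick_reinit_br_wh() while this caller falls back to a
+ * plain vfsub_create().
+ */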
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create or remove the diropq.
+ */
+static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex,
+				unsigned int flags)
+{
+	struct dentry *opq_dentry, *h_dentry;
+	struct super_block *sb;
+	struct au_branch *br;
+	int err;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, bindex);
+	h_dentry = au_h_dptr(dentry, bindex);
+	opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry);
+	if (IS_ERR(opq_dentry))
+		goto out;
+
+	if (au_ftest_diropq(flags, CREATE)) {
+		err = link_or_create_wh(sb, bindex, opq_dentry);
+		if (!err) {
+			au_set_dbdiropq(dentry, bindex);
+			goto out; /* success */
+		}
+	} else {
+		struct path tmp = {
+			.dentry = opq_dentry,
+			.mnt	= au_br_mnt(br)
+		};
+		err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp);
+		if (!err)
+			au_set_dbdiropq(dentry, -1);
+	}
+	dput(opq_dentry);
+	opq_dentry = ERR_PTR(err);
+
+out:
+	return opq_dentry;
+}
+
+struct do_diropq_args {
+	struct dentry **errp;
+	struct dentry *dentry;
+	aufs_bindex_t bindex;
+	unsigned int flags;
+};
+
+static void call_do_diropq(void *args)
+{
+	struct do_diropq_args *a = args;
+	*a->errp = do_diropq(a->dentry, a->bindex, a->flags);
+}
+
+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
+			     unsigned int flags)
+{
+	struct dentry *diropq, *h_dentry;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE))
+		diropq = do_diropq(dentry, bindex, flags);
+	else {
+		int wkq_err;
+		struct do_diropq_args args = {
+			.errp		= &diropq,
+			.dentry		= dentry,
+			.bindex		= bindex,
+			.flags		= flags
+		};
+
+		wkq_err = au_wkq_wait(call_do_diropq, &args);
+		if (unlikely(wkq_err))
+			diropq = ERR_PTR(wkq_err);
+	}
+
+	return diropq;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * lookup whiteout dentry.
+ * @h_parent: lower parent dentry which must exist and be locked
+ * @base_name: name of dentry which will be whiteouted
+ * returns dentry for whiteout.
+ */
+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
+			  struct au_branch *br)
+{
+	int err;
+	struct qstr wh_name;
+	struct dentry *wh_dentry;
+
+	err = au_wh_name_alloc(&wh_name, base_name);
+	wh_dentry = ERR_PTR(err);
+	if (!err) {
+		wh_dentry = vfsub_lkup_one(&wh_name, h_parent);
+		kfree(wh_name.name);
+	}
+	return wh_dentry;
+}
+
+/*
+ * link/create a whiteout for @dentry on @bindex.
+ */
+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
+			    struct dentry *h_parent)
+{
+	struct dentry *wh_dentry;
+	struct super_block *sb;
+	int err;
+
+	sb = dentry->d_sb;
+	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex));
+	if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) {
+		err = link_or_create_wh(sb, bindex, wh_dentry);
+		if (!err) {
+			au_set_dbwh(dentry, bindex);
+			au_fhsm_wrote(sb, bindex, /*force*/0);
+		} else {
+			dput(wh_dentry);
+			wh_dentry = ERR_PTR(err);
+		}
+	}
+
+	return wh_dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Delete all whiteouts in this directory on branch bindex. */
+static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist,
+			   aufs_bindex_t bindex, struct au_branch *br)
+{
+	int err;
+	unsigned long ul, n;
+	struct qstr wh_name;
+	char *p;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct au_vdir_destr *str;
+
+	err = -ENOMEM;
+	p = (void *)__get_free_page(GFP_NOFS);
+	wh_name.name = p;
+	if (unlikely(!wh_name.name))
+		goto out;
+
+	err = 0;
+	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+	p += AUFS_WH_PFX_LEN;
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (ul = 0; !err && ul < n; ul++, head++) {
+		hlist_for_each_entry(pos, head, wh_hash) {
+			if (pos->wh_bindex != bindex)
+				continue;
+
+			str = &pos->wh_str;
+			if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) {
+				memcpy(p, str->name, str->len);
+				wh_name.len = AUFS_WH_PFX_LEN + str->len;
+				err = unlink_wh_name(h_dentry, &wh_name, br);
+				if (!err)
+					continue;
+				break;
+			}
+			AuIOErr("whiteout name too long %.*s\n",
+				str->len, str->name);
+			err = -EIO;
+			break;
+		}
+	}
+	free_page((unsigned long)wh_name.name);
+
+out:
+	return err;
+}
+
+struct del_wh_children_args {
+	int *errp;
+	struct dentry *h_dentry;
+	struct au_nhash *whlist;
+	aufs_bindex_t bindex;
+	struct au_branch *br;
+};
+
+static void call_del_wh_children(void *args)
+{
+	struct del_wh_children_args *a = args;
+	*a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp)
+{
+	struct au_whtmp_rmdir *whtmp;
+	int err;
+	unsigned int rdhash;
+
+	SiMustAnyLock(sb);
+
+	whtmp = kzalloc(sizeof(*whtmp), gfp);
+	if (unlikely(!whtmp)) {
+		whtmp = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* no estimation for dir size */
+	rdhash = au_sbi(sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = AUFS_RDHASH_DEF;
+	err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp);
+	if (unlikely(err)) {
+		kfree(whtmp);
+		whtmp = ERR_PTR(err);
+	}
+
+out:
+	return whtmp;
+}
+
+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp)
+{
+	if (whtmp->br)
+		atomic_dec(&whtmp->br->br_count);
+	dput(whtmp->wh_dentry);
+	iput(whtmp->dir);
+	au_nhash_wh_free(&whtmp->whlist);
+	kfree(whtmp);
+}
+
+/*
+ * rmdir the whiteouted, temporarily named dir @h_dentry.
+ * @whlist: whiteouted children.
+ */
+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
+		   struct dentry *wh_dentry, struct au_nhash *whlist)
+{
+	int err;
+	unsigned int h_nlink;
+	struct path h_tmp;
+	struct inode *wh_inode, *h_dir;
+	struct au_branch *br;
+
+	h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
+	IMustLock(h_dir);
+
+	br = au_sbr(dir->i_sb, bindex);
+	wh_inode = d_inode(wh_dentry);
+	inode_lock_nested(wh_inode, AuLsc_I_CHILD);
+
+	/*
+	 * someone else might have changed some whiteouts while we were sleeping,
+	 * which means this whlist may contain an obsolete entry.
+	 */
+	if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE))
+		err = del_wh_children(wh_dentry, whlist, bindex, br);
+	else {
+		int wkq_err;
+		struct del_wh_children_args args = {
+			.errp		= &err,
+			.h_dentry	= wh_dentry,
+			.whlist		= whlist,
+			.bindex		= bindex,
+			.br		= br
+		};
+
+		wkq_err = au_wkq_wait(call_del_wh_children, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+	inode_unlock(wh_inode);
+
+	if (!err) {
+		h_tmp.dentry = wh_dentry;
+		h_tmp.mnt = au_br_mnt(br);
+		h_nlink = h_dir->i_nlink;
+		err = vfsub_rmdir(h_dir, &h_tmp);
+		/* some filesystems don't change the parent nlink in some cases */
+		h_nlink -= h_dir->i_nlink;
+	}
+
+	if (!err) {
+		if (au_ibstart(dir) == bindex) {
+			/* todo: dir->i_mutex is necessary */
+			au_cpup_attr_timesizes(dir);
+			if (h_nlink)
+				vfsub_drop_nlink(dir);
+		}
+		return 0; /* success */
+	}
+
+	pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err);
+	return err;
+}
+
+static void call_rmdir_whtmp(void *args)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct au_whtmp_rmdir *a = args;
+	struct super_block *sb;
+	struct dentry *h_parent;
+	struct inode *h_dir;
+	struct au_hinode *hdir;
+
+	/* rmdir by nfsd may cause deadlock with this i_mutex */
+	/* inode_lock(a->dir); */
+	err = -EROFS;
+	sb = a->dir->i_sb;
+	si_read_lock(sb, !AuLock_FLUSH);
+	if (!au_br_writable(a->br->br_perm))
+		goto out;
+	bindex = au_br_index(sb, a->br->br_id);
+	if (unlikely(bindex < 0))
+		goto out;
+
+	err = -EIO;
+	ii_write_lock_parent(a->dir);
+	h_parent = dget_parent(a->wh_dentry);
+	h_dir = d_inode(h_parent);
+	hdir = au_hi(a->dir, bindex);
+	err = vfsub_mnt_want_write(au_br_mnt(a->br));
+	if (unlikely(err))
+		goto out_mnt;
+	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
+	err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent,
+			  a->br);
+	if (!err)
+		err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist);
+	au_hn_imtx_unlock(hdir);
+	vfsub_mnt_drop_write(au_br_mnt(a->br));
+
+out_mnt:
+	dput(h_parent);
+	ii_write_unlock(a->dir);
+out:
+	/* inode_unlock(a->dir); */
+	au_whtmp_rmdir_free(a);
+	si_read_unlock(sb);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	if (unlikely(err))
+		AuIOErr("err %d\n", err);
+}
+
+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
+			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args)
+{
+	int wkq_err;
+	struct super_block *sb;
+
+	IMustLock(dir);
+
+	/* all post-processing will be done in call_rmdir_whtmp(). */
+	sb = dir->i_sb;
+	args->dir = au_igrab(dir);
+	args->br = au_sbr(sb, bindex);
+	atomic_inc(&args->br->br_count);
+	args->wh_dentry = dget(wh_dentry);
+	wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0);
+	if (unlikely(wkq_err)) {
+		pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err);
+		au_whtmp_rmdir_free(args);
+	}
+}
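/*
 * A minimal userspace sketch of how del_wh_children() above composes a lower
 * whiteout name: the prefix is copied into a scratch buffer once and the
 * victim's name is appended behind it.  The value of AUFS_WH_PFX is an
 * assumption here (the conventional aufs ".wh." prefix); it is not defined in
 * this hunk.
 */
#include <stdio.h>
#include <string.h>

#define AUFS_WH_PFX	".wh."			/* assumed value */
#define AUFS_WH_PFX_LEN	(sizeof(AUFS_WH_PFX) - 1)

int main(void)
{
	char buf[256];
	const char *victim = "file.txt";

	/* prefix first, then the original name, as the kernel loop does */
	memcpy(buf, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
	memcpy(buf + AUFS_WH_PFX_LEN, victim, strlen(victim) + 1);
	printf("%s\n", buf);			/* prints ".wh.file.txt" */
	return 0;
}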
diff --git b/fs/aufs/whout.h b/fs/aufs/whout.h
new file mode 100644
index 0000000..4077dd1
--- /dev/null
+++ b/fs/aufs/whout.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * whiteout for logical deletion and opaque directory
+ */
+
+#ifndef __AUFS_WHOUT_H__
+#define __AUFS_WHOUT_H__
+
+#ifdef __KERNEL__
+
+#include "dir.h"
+
+/* whout.c */
+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name);
+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio);
+int au_diropq_test(struct dentry *h_dentry);
+struct au_branch;
+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
+			     struct qstr *prefix);
+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br);
+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
+			struct dentry *dentry);
+int au_wh_init(struct au_branch *br, struct super_block *sb);
+
+/* diropq flags */
+#define AuDiropq_CREATE	1
+#define au_ftest_diropq(flags, name)	((flags) & AuDiropq_##name)
+#define au_fset_diropq(flags, name) \
+	do { (flags) |= AuDiropq_##name; } while (0)
+#define au_fclr_diropq(flags, name) \
+	do { (flags) &= ~AuDiropq_##name; } while (0)
+
+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
+			     unsigned int flags);
+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
+			  struct au_branch *br);
+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
+			    struct dentry *h_parent);
+
+/* real rmdir for the whiteout-ed dir */
+struct au_whtmp_rmdir {
+	struct inode *dir;
+	struct au_branch *br;
+	struct dentry *wh_dentry;
+	struct au_nhash whlist;
+};
+
+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp);
+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp);
+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
+		   struct dentry *wh_dentry, struct au_nhash *whlist);
+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
+			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct dentry *au_diropq_create(struct dentry *dentry,
+					      aufs_bindex_t bindex)
+{
+	return au_diropq_sio(dentry, bindex, AuDiropq_CREATE);
+}
+
+static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE));
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_WHOUT_H__ */
diff --git b/fs/aufs/wkq.c b/fs/aufs/wkq.c
new file mode 100644
index 0000000..0f1500e
--- /dev/null
+++ b/fs/aufs/wkq.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * workqueue for asynchronous/super-io operations
+ * todo: try new credential scheme
+ */
+
+#include <linux/module.h>
+#include "aufs.h"
+
+/* internal workqueue named AUFS_WKQ_NAME */
+
+static struct workqueue_struct *au_wkq;
+
+struct au_wkinfo {
+	struct work_struct wk;
+	struct kobject *kobj;
+
+	unsigned int flags; /* see wkq.h */
+
+	au_wkq_func_t func;
+	void *args;
+
+	struct completion *comp;
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void wkq_func(struct work_struct *wk)
+{
+	struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk);
+
+	AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID));
+	AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY);
+
+	wkinfo->func(wkinfo->args);
+	if (au_ftest_wkq(wkinfo->flags, WAIT))
+		complete(wkinfo->comp);
+	else {
+		kobject_put(wkinfo->kobj);
+		module_put(THIS_MODULE); /* todo: ?? */
+		kfree(wkinfo);
+	}
+}
+
+/*
+ * Since struct completion is large, try allocating it dynamically.
+ */
+#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */
+#define AuWkqCompDeclare(name)	struct completion *comp = NULL
+
+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
+{
+	*comp = kmalloc(sizeof(**comp), GFP_NOFS);
+	if (*comp) {
+		init_completion(*comp);
+		wkinfo->comp = *comp;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+static void au_wkq_comp_free(struct completion *comp)
+{
+	kfree(comp);
+}
+
+#else
+
+/* no braces */
+#define AuWkqCompDeclare(name) \
+	DECLARE_COMPLETION_ONSTACK(_ ## name); \
+	struct completion *comp = &_ ## name
+
+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
+{
+	wkinfo->comp = *comp;
+	return 0;
+}
+
+static void au_wkq_comp_free(struct completion *comp __maybe_unused)
+{
+	/* empty */
+}
+#endif /* 4KSTACKS */
+
+static void au_wkq_run(struct au_wkinfo *wkinfo)
+{
+	if (au_ftest_wkq(wkinfo->flags, NEST)) {
+		if (au_wkq_test()) {
+			AuWarn1("wkq from wkq, unless silly-rename on NFS,"
+				" due to a dead dir by UDBA?\n");
+			AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT));
+		}
+	} else
+		au_dbg_verify_kthread();
+
+	if (au_ftest_wkq(wkinfo->flags, WAIT)) {
+		INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func);
+		queue_work(au_wkq, &wkinfo->wk);
+	} else {
+		INIT_WORK(&wkinfo->wk, wkq_func);
+		schedule_work(&wkinfo->wk);
+	}
+}
+
+/*
+ * Be careful. It is easy to cause a deadlock.
+ * processA: lock, wkq and wait
+ * processB: wkq and wait, lock in wkq
+ * --> deadlock
+ */
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
+{
+	int err;
+	AuWkqCompDeclare(comp);
+	struct au_wkinfo wkinfo = {
+		.flags	= flags,
+		.func	= func,
+		.args	= args
+	};
+
+	err = au_wkq_comp_alloc(&wkinfo, &comp);
+	if (!err) {
+		au_wkq_run(&wkinfo);
+		/* no timeout, no interrupt */
+		wait_for_completion(wkinfo.comp);
+		au_wkq_comp_free(comp);
+		destroy_work_on_stack(&wkinfo.wk);
+	}
+
+	return err;
+
+}
+
+/*
+ * Note: calling dget/dput() on aufs dentries in @func is not supported; it
+ * would be a problem during a concurrent unmount.
+ */
+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
+		  unsigned int flags)
+{
+	int err;
+	struct au_wkinfo *wkinfo;
+
+	atomic_inc(&au_sbi(sb)->si_nowait.nw_len);
+
+	/*
+	 * wkq_func() must free this wkinfo.
+	 * it highly depends upon the implementation of workqueue.
+	 */
+	err = 0;
+	wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS);
+	if (wkinfo) {
+		wkinfo->kobj = &au_sbi(sb)->si_kobj;
+		wkinfo->flags = flags & ~AuWkq_WAIT;
+		wkinfo->func = func;
+		wkinfo->args = args;
+		wkinfo->comp = NULL;
+		kobject_get(wkinfo->kobj);
+		__module_get(THIS_MODULE); /* todo: ?? */
+
+		au_wkq_run(wkinfo);
+	} else {
+		err = -ENOMEM;
+		au_nwt_done(&au_sbi(sb)->si_nowait);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_nwt_init(struct au_nowait_tasks *nwt)
+{
+	atomic_set(&nwt->nw_len, 0);
+	/* smp_mb(); */ /* atomic_set */
+	init_waitqueue_head(&nwt->nw_wq);
+}
+
+void au_wkq_fin(void)
+{
+	destroy_workqueue(au_wkq);
+}
+
+int __init au_wkq_init(void)
+{
+	int err;
+
+	err = 0;
+	au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE);
+	if (IS_ERR(au_wkq))
+		err = PTR_ERR(au_wkq);
+	else if (!au_wkq)
+		err = -ENOMEM;
+
+	return err;
+}
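/*
 * A rough userspace analogue of the dispatch-and-wait pattern implemented by
 * au_wkq_do_wait()/wkq_func() above: the caller hands a function to a worker
 * and blocks on a hand-rolled "completion" until the worker signals it.
 * Every name below (struct wkinfo, say_hello, ...) is illustrative only, not
 * aufs API; build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

struct wkinfo {
	void (*func)(void *args);
	void *args;
	struct completion comp;
};

/* worker side: run the queued job, then signal the completion */
static void *worker(void *p)
{
	struct wkinfo *wk = p;

	wk->func(wk->args);
	pthread_mutex_lock(&wk->comp.lock);
	wk->comp.done = 1;
	pthread_cond_signal(&wk->comp.cond);	/* the complete() step */
	pthread_mutex_unlock(&wk->comp.lock);
	return NULL;
}

static void say_hello(void *args)
{
	printf("worker running %s\n", (const char *)args);
}

int main(void)
{
	pthread_t t;
	struct wkinfo wk = { .func = say_hello, .args = "say_hello" };

	pthread_mutex_init(&wk.comp.lock, NULL);
	pthread_cond_init(&wk.comp.cond, NULL);
	wk.comp.done = 0;

	pthread_create(&t, NULL, worker, &wk);

	/* caller side: the wait_for_completion() step */
	pthread_mutex_lock(&wk.comp.lock);
	while (!wk.comp.done)
		pthread_cond_wait(&wk.comp.cond, &wk.comp.lock);
	pthread_mutex_unlock(&wk.comp.lock);

	pthread_join(t, NULL);
	return 0;
}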
diff --git b/fs/aufs/wkq.h b/fs/aufs/wkq.h
new file mode 100644
index 0000000..f6c9b99
--- /dev/null
+++ b/fs/aufs/wkq.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * workqueue for asynchronous/super-io operations
+ * todo: try new credentials management scheme
+ */
+
+#ifndef __AUFS_WKQ_H__
+#define __AUFS_WKQ_H__
+
+#ifdef __KERNEL__
+
+struct super_block;
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * in the next operation, wait for the 'nowait' tasks in the system-wide workqueue
+ */
+struct au_nowait_tasks {
+	atomic_t		nw_len;
+	wait_queue_head_t	nw_wq;
+};
+
+/* ---------------------------------------------------------------------- */
+
+typedef void (*au_wkq_func_t)(void *args);
+
+/* wkq flags */
+#define AuWkq_WAIT	1
+#define AuWkq_NEST	(1 << 1)
+#define au_ftest_wkq(flags, name)	((flags) & AuWkq_##name)
+#define au_fset_wkq(flags, name) \
+	do { (flags) |= AuWkq_##name; } while (0)
+#define au_fclr_wkq(flags, name) \
+	do { (flags) &= ~AuWkq_##name; } while (0)
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuWkq_NEST
+#define AuWkq_NEST	0
+#endif
+
+/* wkq.c */
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args);
+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
+		  unsigned int flags);
+void au_nwt_init(struct au_nowait_tasks *nwt);
+int __init au_wkq_init(void);
+void au_wkq_fin(void);
+
+/* ---------------------------------------------------------------------- */
+
+static inline int au_wkq_test(void)
+{
+	return current->flags & PF_WQ_WORKER;
+}
+
+static inline int au_wkq_wait(au_wkq_func_t func, void *args)
+{
+	return au_wkq_do_wait(AuWkq_WAIT, func, args);
+}
+
+static inline void au_nwt_done(struct au_nowait_tasks *nwt)
+{
+	if (atomic_dec_and_test(&nwt->nw_len))
+		wake_up_all(&nwt->nw_wq);
+}
+
+static inline int au_nwt_flush(struct au_nowait_tasks *nwt)
+{
+	wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len));
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_WKQ_H__ */
diff --git b/fs/aufs/xattr.c b/fs/aufs/xattr.c
new file mode 100644
index 0000000..890c107
--- /dev/null
+++ b/fs/aufs/xattr.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (C) 2014-2016 Junjiro R. Okajima
+ */
+
+/*
+ * handling xattr functions
+ */
+
+#include <linux/xattr.h>
+#include "aufs.h"
+
+static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags)
+{
+	if (!ignore_flags)
+		goto out;
+	switch (err) {
+	case -ENOMEM:
+	case -EDQUOT:
+		goto out;
+	}
+
+	if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) {
+		err = 0;
+		goto out;
+	}
+
+#define cmp(brattr, prefix) do {					\
+		if (!strncmp(name, XATTR_##prefix##_PREFIX,		\
+			     XATTR_##prefix##_PREFIX_LEN)) {		\
+			if (ignore_flags & AuBrAttr_ICEX_##brattr)	\
+				err = 0;				\
+			goto out;					\
+		}							\
+	} while (0)
+
+	cmp(SEC, SECURITY);
+	cmp(SYS, SYSTEM);
+	cmp(TR, TRUSTED);
+	cmp(USR, USER);
+#undef cmp
+
+	if (ignore_flags & AuBrAttr_ICEX_OTH)
+		err = 0;
+
+out:
+	return err;
+}
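/*
 * au_xattr_ignore() above classifies an xattr name purely by prefix before
 * checking the per-namespace ICEX flags.  The same classification as a
 * standalone sketch; the prefix strings are the standard Linux xattr
 * namespaces, assumed here since <linux/xattr.h> is not part of this hunk.
 */
#include <stdio.h>
#include <string.h>

static const char *xattr_namespace(const char *name)
{
	static const char *const prefixes[] = {
		"security.", "system.", "trusted.", "user.",
	};
	size_t i;

	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
		if (!strncmp(name, prefixes[i], strlen(prefixes[i])))
			return prefixes[i];
	return "(other)";
}

int main(void)
{
	const char *const names[] = {
		"security.selinux", "user.comment", "weird.attr",
	};
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-20s -> namespace %s\n", names[i],
		       xattr_namespace(names[i]));
	return 0;
}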
+
+static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1;
+
+static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src,
+			    char *name, char **buf, unsigned int ignore_flags,
+			    unsigned int verbose)
+{
+	int err;
+	ssize_t ssz;
+	struct inode *h_idst;
+
+	ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS);
+	err = ssz;
+	if (unlikely(err <= 0)) {
+		if (err == -ENODATA
+		    || (err == -EOPNOTSUPP
+			&& ((ignore_flags & au_xattr_out_of_list)
+			    || (au_test_nfs_noacl(d_inode(h_src))
+				&& (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)
+				    || !strcmp(name,
+					       XATTR_NAME_POSIX_ACL_DEFAULT))))
+			    ))
+			err = 0;
+		if (err && (verbose || au_debug_test()))
+			pr_err("%s, err %d\n", name, err);
+		goto out;
+	}
+
+	/* unlock it temporarily */
+	h_idst = d_inode(h_dst);
+	inode_unlock(h_idst);
+	err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0);
+	inode_lock_nested(h_idst, AuLsc_I_CHILD2);
+	if (unlikely(err)) {
+		if (verbose || au_debug_test())
+			pr_err("%s, err %d\n", name, err);
+		err = au_xattr_ignore(err, name, ignore_flags);
+	}
+
+out:
+	return err;
+}
+
+int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
+		  unsigned int verbose)
+{
+	int err, unlocked, acl_access, acl_default;
+	ssize_t ssz;
+	struct inode *h_isrc, *h_idst;
+	char *value, *p, *o, *e;
+
+	/* try to prevent updates to the source inode while we reference it */
+	/* there should be no parent-child relationship between them */
+	h_isrc = d_inode(h_src);
+	h_idst = d_inode(h_dst);
+	inode_unlock(h_idst);
+	inode_lock_nested(h_isrc, AuLsc_I_CHILD);
+	inode_lock_nested(h_idst, AuLsc_I_CHILD2);
+	unlocked = 0;
+
+	/* some filesystems don't list POSIX ACL, for example tmpfs */
+	ssz = vfs_listxattr(h_src, NULL, 0);
+	err = ssz;
+	if (unlikely(err < 0)) {
+		AuTraceErr(err);
+		if (err == -ENODATA
+		    || err == -EOPNOTSUPP)
+			err = 0;	/* ignore */
+		goto out;
+	}
+
+	err = 0;
+	p = NULL;
+	o = NULL;
+	if (ssz) {
+		err = -ENOMEM;
+		p = kmalloc(ssz, GFP_NOFS);
+		o = p;
+		if (unlikely(!p))
+			goto out;
+		err = vfs_listxattr(h_src, p, ssz);
+	}
+	inode_unlock(h_isrc);
+	unlocked = 1;
+	AuDbg("err %d, ssz %zd\n", err, ssz);
+	if (unlikely(err < 0))
+		goto out_free;
+
+	err = 0;
+	e = p + ssz;
+	value = NULL;
+	acl_access = 0;
+	acl_default = 0;
+	while (!err && p < e) {
+		acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS,
+				       sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1);
+		acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT,
+					sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)
+					- 1);
+		err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags,
+				       verbose);
+		p += strlen(p) + 1;
+	}
+	AuTraceErr(err);
+	ignore_flags |= au_xattr_out_of_list;
+	if (!err && !acl_access) {
+		err = au_do_cpup_xattr(h_dst, h_src,
+				       XATTR_NAME_POSIX_ACL_ACCESS, &value,
+				       ignore_flags, verbose);
+		AuTraceErr(err);
+	}
+	if (!err && !acl_default) {
+		err = au_do_cpup_xattr(h_dst, h_src,
+				       XATTR_NAME_POSIX_ACL_DEFAULT, &value,
+				       ignore_flags, verbose);
+		AuTraceErr(err);
+	}
+
+	kfree(value);
+
+out_free:
+	kfree(o);
+out:
+	if (!unlocked)
+		inode_unlock(h_isrc);
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+enum {
+	AU_XATTR_LIST,
+	AU_XATTR_GET
+};
+
+struct au_lgxattr {
+	int type;
+	union {
+		struct {
+			char	*list;
+			size_t	size;
+		} list;
+		struct {
+			const char	*name;
+			void		*value;
+			size_t		size;
+		} get;
+	} u;
+};
+
+static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg)
+{
+	ssize_t err;
+	struct path h_path;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+	err = au_h_path_getattr(dentry, /*force*/1, &h_path);
+	if (unlikely(err))
+		goto out_si;
+	if (unlikely(!h_path.dentry))
+		/* illegally overlapped or something */
+		goto out_di; /* pretending success */
+
+	/* always topmost entry only */
+	switch (arg->type) {
+	case AU_XATTR_LIST:
+		err = vfs_listxattr(h_path.dentry,
+				    arg->u.list.list, arg->u.list.size);
+		break;
+	case AU_XATTR_GET:
+		err = vfs_getxattr(h_path.dentry,
+				   arg->u.get.name, arg->u.get.value,
+				   arg->u.get.size);
+		break;
+	}
+
+out_di:
+	di_read_unlock(dentry, AuLock_IR);
+out_si:
+	si_read_unlock(sb);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	struct au_lgxattr arg = {
+		.type = AU_XATTR_LIST,
+		.u.list = {
+			.list	= list,
+			.size	= size
+		},
+	};
+
+	return au_lgxattr(dentry, &arg);
+}
+
+ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	struct au_lgxattr arg = {
+		.type = AU_XATTR_GET,
+		.u.get = {
+			.name	= name,
+			.value	= value,
+			.size	= size
+		},
+	};
+
+	return au_lgxattr(dentry, &arg);
+}
+
+int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags)
+{
+	struct au_srxattr arg = {
+		.type = AU_XATTR_SET,
+		.u.set = {
+			.name	= name,
+			.value	= value,
+			.size	= size,
+			.flags	= flags
+		},
+	};
+
+	return au_srxattr(dentry, &arg);
+}
+
+int aufs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct au_srxattr arg = {
+		.type = AU_XATTR_REMOVE,
+		.u.remove = {
+			.name	= name
+		},
+	};
+
+	return au_srxattr(dentry, &arg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+#if 0
+static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size,
+			    const char *name, size_t name_len, int type)
+{
+	return aufs_listxattr(dentry, list, list_size);
+}
+
+static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer,
+			size_t size, int type)
+{
+	return aufs_getxattr(dentry, name, buffer, size);
+}
+
+static int au_xattr_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	return aufs_setxattr(dentry, name, value, size, flags);
+}
+
+static const struct xattr_handler au_xattr_handler = {
+	/* no prefix, no flags */
+	.list	= au_xattr_list,
+	.get	= au_xattr_get,
+	.set	= au_xattr_set
+	/* why no remove? */
+};
+
+static const struct xattr_handler *au_xattr_handlers[] = {
+	&au_xattr_handler
+};
+
+void au_xattr_init(struct super_block *sb)
+{
+	/* sb->s_xattr = au_xattr_handlers; */
+}
+#endif
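/*
 * au_cpup_xattr() above lists the source's xattr names into one buffer and
 * walks the NUL-separated list, copying each value to the destination.  The
 * same walk, sketched with the libc xattr wrappers on paths; the fixed-size
 * value buffer and the coarse error handling are simplifications of this
 * sketch, not aufs behaviour.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

static int copy_xattrs(const char *src, const char *dst)
{
	ssize_t lsz, vsz;
	char *names, *p, *end;
	char value[4096];	/* assumed upper bound for a single value */

	lsz = listxattr(src, NULL, 0);
	if (lsz <= 0)
		return lsz < 0 ? -1 : 0;	/* error, or nothing to copy */
	names = malloc(lsz);
	if (!names)
		return -1;
	lsz = listxattr(src, names, lsz);
	if (lsz < 0)
		goto out;

	end = names + lsz;
	for (p = names; p < end; p += strlen(p) + 1) {
		vsz = getxattr(src, p, value, sizeof(value));
		if (vsz < 0)
			continue;	/* e.g. unreadable namespace; skip it */
		if (setxattr(dst, p, value, vsz, 0))
			fprintf(stderr, "setxattr(%s) failed\n", p);
	}

out:
	free(names);
	return lsz < 0 ? -1 : 0;
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	return copy_xattrs(argv[1], argv[2]) ? 1 : 0;
}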
diff --git b/fs/aufs/xino.c b/fs/aufs/xino.c
new file mode 100644
index 0000000..2773cae
--- /dev/null
+++ b/fs/aufs/xino.c
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * external inode number translation table and bitmap
+ */
+
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/* todo: unnecessary to support mmap_sem since kernel-space? */
+ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size,
+		   loff_t *pos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	do {
+		/* todo: signal_pending? */
+		err = func(file, buf.u, size, pos);
+	} while (err == -EAGAIN || err == -EINTR);
+	set_fs(oldfs);
+
+#if 0 /* reserved for future use */
+	if (err > 0)
+		fsnotify_access(file->f_path.dentry);
+#endif
+
+	return err;
+}
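/*
 * xino_fread() above retries because the lower filesystem's read method may
 * return -EAGAIN or -EINTR before transferring any data.  The userspace
 * counterpart of that retry, sketched with pread(); the helper name is made
 * up.  Run with stdin redirected from a regular file, since pread() needs a
 * seekable descriptor.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* retry a positional read until it is not interrupted */
static ssize_t pread_retry(int fd, void *buf, size_t size, off_t pos)
{
	ssize_t n;

	do {
		n = pread(fd, buf, size, pos);
	} while (n < 0 && (errno == EINTR || errno == EAGAIN));
	return n;
}

int main(void)
{
	char buf[64];
	ssize_t n = pread_retry(STDIN_FILENO, buf, sizeof(buf), 0);

	if (n >= 0)
		printf("read %zd bytes\n", n);
	return n < 0;
}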
+
+/* ---------------------------------------------------------------------- */
+
+static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
+			       size_t size, loff_t *pos);
+
+static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf,
+			      size_t size, loff_t *pos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		const char __user *u;
+	} buf;
+	int i;
+	const int prevent_endless = 10;
+
+	i = 0;
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	do {
+		err = func(file, buf.u, size, pos);
+		if (err == -EINTR
+		    && !au_wkq_test()
+		    && fatal_signal_pending(current)) {
+			set_fs(oldfs);
+			err = xino_fwrite_wkq(func, file, kbuf, size, pos);
+			BUG_ON(err == -EINTR);
+			oldfs = get_fs();
+			set_fs(KERNEL_DS);
+		}
+	} while (i++ < prevent_endless
+		 && (err == -EAGAIN || err == -EINTR));
+	set_fs(oldfs);
+
+#if 0 /* reserved for future use */
+	if (err > 0)
+		fsnotify_modify(file->f_path.dentry);
+#endif
+
+	return err;
+}
+
+struct do_xino_fwrite_args {
+	ssize_t *errp;
+	vfs_writef_t func;
+	struct file *file;
+	void *buf;
+	size_t size;
+	loff_t *pos;
+};
+
+static void call_do_xino_fwrite(void *args)
+{
+	struct do_xino_fwrite_args *a = args;
+	*a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos);
+}
+
+static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
+			       size_t size, loff_t *pos)
+{
+	ssize_t err;
+	int wkq_err;
+	struct do_xino_fwrite_args args = {
+		.errp	= &err,
+		.func	= func,
+		.file	= file,
+		.buf	= buf,
+		.size	= size,
+		.pos	= pos
+	};
+
+	/*
+	 * it breaks RLIMIT_FSIZE and the normal user's limit;
+	 * users should care about quota and a real 'filesystem full.'
+	 */
+	wkq_err = au_wkq_wait(call_do_xino_fwrite, &args);
+	if (unlikely(wkq_err))
+		err = wkq_err;
+
+	return err;
+}
+
+ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
+		    size_t size, loff_t *pos)
+{
+	ssize_t err;
+
+	if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) {
+		lockdep_off();
+		err = do_xino_fwrite(func, file, buf, size, pos);
+		lockdep_on();
+	} else
+		err = xino_fwrite_wkq(func, file, buf, size, pos);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create a new xinofile at the same place/path as @base_file.
+ */
+struct file *au_xino_create2(struct file *base_file, struct file *copy_src)
+{
+	struct file *file;
+	struct dentry *base, *parent;
+	struct inode *dir, *delegated;
+	struct qstr *name;
+	struct path path;
+	int err;
+
+	base = base_file->f_path.dentry;
+	parent = base->d_parent; /* dir inode is locked */
+	dir = d_inode(parent);
+	IMustLock(dir);
+
+	file = ERR_PTR(-EINVAL);
+	name = &base->d_name;
+	path.dentry = vfsub_lookup_one_len(name->name, parent, name->len);
+	if (IS_ERR(path.dentry)) {
+		file = (void *)path.dentry;
+		pr_err("%pd lookup err %ld\n",
+		       base, PTR_ERR(path.dentry));
+		goto out;
+	}
+
+	/* no need to mnt_want_write() since we call dentry_open() later */
+	err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL);
+	if (unlikely(err)) {
+		file = ERR_PTR(err);
+		pr_err("%pd create err %d\n", base, err);
+		goto out_dput;
+	}
+
+	path.mnt = base_file->f_path.mnt;
+	file = vfsub_dentry_open(&path,
+				 O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
+				 /* | __FMODE_NONOTIFY */);
+	if (IS_ERR(file)) {
+		pr_err("%pd open err %ld\n", base, PTR_ERR(file));
+		goto out_dput;
+	}
+
+	delegated = NULL;
+	err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err)) {
+		pr_err("%pd unlink err %d\n", base, err);
+		goto out_fput;
+	}
+
+	if (copy_src) {
+		/* no one can touch copy_src xino */
+		err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src));
+		if (unlikely(err)) {
+			pr_err("%pd copy err %d\n", base, err);
+			goto out_fput;
+		}
+	}
+	goto out_dput; /* success */
+
+out_fput:
+	fput(file);
+	file = ERR_PTR(err);
+out_dput:
+	dput(path.dentry);
+out:
+	return file;
+}
+
+struct au_xino_lock_dir {
+	struct au_hinode *hdir;
+	struct dentry *parent;
+	struct inode *dir;
+};
+
+static void au_xino_lock_dir(struct super_block *sb, struct file *xino,
+			     struct au_xino_lock_dir *ldir)
+{
+	aufs_bindex_t brid, bindex;
+
+	ldir->hdir = NULL;
+	bindex = -1;
+	brid = au_xino_brid(sb);
+	if (brid >= 0)
+		bindex = au_br_index(sb, brid);
+	if (bindex >= 0) {
+		ldir->hdir = au_hi(d_inode(sb->s_root), bindex);
+		au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT);
+	} else {
+		ldir->parent = dget_parent(xino->f_path.dentry);
+		ldir->dir = d_inode(ldir->parent);
+		inode_lock_nested(ldir->dir, AuLsc_I_PARENT);
+	}
+}
+
+static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir)
+{
+	if (ldir->hdir)
+		au_hn_imtx_unlock(ldir->hdir);
+	else {
+		inode_unlock(ldir->dir);
+		dput(ldir->parent);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* truncate xino files asynchronously */
+
+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err;
+	unsigned long jiffy;
+	blkcnt_t blocks;
+	aufs_bindex_t bi, bend;
+	struct kstatfs *st;
+	struct au_branch *br;
+	struct file *new_xino, *file;
+	struct super_block *h_sb;
+	struct au_xino_lock_dir ldir;
+
+	err = -ENOMEM;
+	st = kmalloc(sizeof(*st), GFP_NOFS);
+	if (unlikely(!st))
+		goto out;
+
+	err = -EINVAL;
+	bend = au_sbend(sb);
+	if (unlikely(bindex < 0 || bend < bindex))
+		goto out_st;
+	br = au_sbr(sb, bindex);
+	file = br->br_xino.xi_file;
+	if (!file)
+		goto out_st;
+
+	err = vfs_statfs(&file->f_path, st);
+	if (unlikely(err))
+		AuErr1("statfs err %d, ignored\n", err);
+	jiffy = jiffies;
+	blocks = file_inode(file)->i_blocks;
+	pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
+		bindex, (u64)blocks, st->f_bfree, st->f_blocks);
+
+	au_xino_lock_dir(sb, file, &ldir);
+	/* mnt_want_write() is unnecessary here */
+	new_xino = au_xino_create2(file, file);
+	au_xino_unlock_dir(&ldir);
+	err = PTR_ERR(new_xino);
+	if (IS_ERR(new_xino)) {
+		pr_err("err %d, ignored\n", err);
+		goto out_st;
+	}
+	err = 0;
+	fput(file);
+	br->br_xino.xi_file = new_xino;
+
+	h_sb = au_br_sb(br);
+	for (bi = 0; bi <= bend; bi++) {
+		if (unlikely(bi == bindex))
+			continue;
+		br = au_sbr(sb, bi);
+		if (au_br_sb(br) != h_sb)
+			continue;
+
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = new_xino;
+		get_file(new_xino);
+	}
+
+	err = vfs_statfs(&new_xino->f_path, st);
+	if (!err) {
+		pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
+			bindex, (u64)file_inode(new_xino)->i_blocks,
+			st->f_bfree, st->f_blocks);
+		if (file_inode(new_xino)->i_blocks < blocks)
+			au_sbi(sb)->si_xino_jiffy = jiffy;
+	} else
+		AuErr1("statfs err %d, ignored\n", err);
+
+out_st:
+	kfree(st);
+out:
+	return err;
+}
+
+struct xino_do_trunc_args {
+	struct super_block *sb;
+	struct au_branch *br;
+};
+
+static void xino_do_trunc(void *_args)
+{
+	struct xino_do_trunc_args *args = _args;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct inode *dir;
+	int err;
+	aufs_bindex_t bindex;
+
+	err = 0;
+	sb = args->sb;
+	dir = d_inode(sb->s_root);
+	br = args->br;
+
+	si_noflush_write_lock(sb);
+	ii_read_lock_parent(dir);
+	bindex = au_br_index(sb, br->br_id);
+	err = au_xino_trunc(sb, bindex);
+	ii_read_unlock(dir);
+	if (unlikely(err))
+		pr_warn("err b%d, (%d)\n", bindex, err);
+	atomic_dec(&br->br_xino_running);
+	atomic_dec(&br->br_count);
+	si_write_unlock(sb);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	kfree(args);
+}
+
+static int xino_trunc_test(struct super_block *sb, struct au_branch *br)
+{
+	int err;
+	struct kstatfs st;
+	struct au_sbinfo *sbinfo;
+
+	/* todo: si_xino_expire and the ratio should be customizable */
+	sbinfo = au_sbi(sb);
+	if (time_before(jiffies,
+			sbinfo->si_xino_jiffy + sbinfo->si_xino_expire))
+		return 0;
+
+	/* truncation border */
+	err = vfs_statfs(&br->br_xino.xi_file->f_path, &st);
+	if (unlikely(err)) {
+		AuErr1("statfs err %d, ignored\n", err);
+		return 0;
+	}
+	if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC)
+		return 0;
+
+	return 1;
+}
+
+static void xino_try_trunc(struct super_block *sb, struct au_branch *br)
+{
+	struct xino_do_trunc_args *args;
+	int wkq_err;
+
+	if (!xino_trunc_test(sb, br))
+		return;
+
+	if (atomic_inc_return(&br->br_xino_running) > 1)
+		goto out;
+
+	/* locking and kfree() will be done in xino_do_trunc() */
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (unlikely(!args)) {
+		AuErr1("no memory\n");
+		goto out_args;
+	}
+
+	atomic_inc(&br->br_count);
+	args->sb = sb;
+	args->br = br;
+	wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0);
+	if (!wkq_err)
+		return; /* success */
+
+	pr_err("wkq %d\n", wkq_err);
+	atomic_dec(&br->br_count);
+
+out_args:
+	kfree(args);
+out:
+	atomic_dec(&br->br_xino_running);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_xino_do_write(vfs_writef_t write, struct file *file,
+			    ino_t h_ino, ino_t ino)
+{
+	loff_t pos;
+	ssize_t sz;
+
+	pos = h_ino;
+	if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) {
+		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
+		return -EFBIG;
+	}
+	pos *= sizeof(ino);
+	sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos);
+	if (sz == sizeof(ino))
+		return 0; /* success */
+
+	AuIOErr("write failed (%zd)\n", sz);
+	return -EIO;
+}
+
+/*
+ * write @ino to the xinofile for the specified branch{@sb, @bindex}
+ * at the position of @h_ino.
+ * even if @ino is zero, it is written to the xinofile and means no entry.
+ * if the size of the xino file on a specific filesystem exceeds the watermark,
+ * try truncating it.
+ */
+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		  ino_t ino)
+{
+	int err;
+	unsigned int mnt_flags;
+	struct au_branch *br;
+
+	BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max)
+		     || ((loff_t)-1) > 0);
+	SiMustAnyLock(sb);
+
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, XINO))
+		return 0;
+
+	br = au_sbr(sb, bindex);
+	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
+			       h_ino, ino);
+	if (!err) {
+		if (au_opt_test(mnt_flags, TRUNC_XINO)
+		    && au_test_fs_trunc_xino(au_br_sb(br)))
+			xino_try_trunc(sb, br);
+		return 0; /* success */
+	}
+
+	AuIOErr("write failed (%d)\n", err);
+	return -EIO;
+}
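/*
 * The xino file written via au_xino_do_write() above is a flat (and usually
 * sparse) array: slot h_ino holds the aufs inode number, and a zero slot
 * means "no entry".  A userspace sketch of that layout; uint64_t stands in
 * for ino_t and the file name "xino.demo" is made up.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* byte offset of the slot for a given lower inode number */
static off_t xino_pos(uint64_t h_ino)
{
	return (off_t)(h_ino * sizeof(uint64_t));
}

int main(void)
{
	uint64_t h_ino = 1234, ino = 42, got = 0;
	int fd = open("xino.demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	/* record the translation h_ino -> ino */
	if (pwrite(fd, &ino, sizeof(ino), xino_pos(h_ino)) != sizeof(ino))
		return 1;
	/* later: translate the lower inode number back */
	if (pread(fd, &got, sizeof(got), xino_pos(h_ino)) != sizeof(got))
		return 1;
	printf("h_ino %llu -> aufs ino %llu\n",
	       (unsigned long long)h_ino, (unsigned long long)got);
	close(fd);
	return 0;
}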
+
+/* ---------------------------------------------------------------------- */
+
+/* aufs inode number bitmap */
+
+static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE;
+static ino_t xib_calc_ino(unsigned long pindex, int bit)
+{
+	ino_t ino;
+
+	AuDebugOn(bit < 0 || page_bits <= bit);
+	ino = AUFS_FIRST_INO + pindex * page_bits + bit;
+	return ino;
+}
+
+static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit)
+{
+	AuDebugOn(ino < AUFS_FIRST_INO);
+	ino -= AUFS_FIRST_INO;
+	*pindex = ino / page_bits;
+	*bit = ino % page_bits;
+}
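/*
 * xib_calc_ino() and xib_calc_bit() above are inverse mappings between an
 * aufs inode number and a (page index, bit) position in the bitmap.  A
 * round-trip check of that arithmetic; the PAGE_SIZE and AUFS_FIRST_INO
 * values below are plausible examples assumed for this sketch, not taken
 * from this hunk.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL	/* assumed */
#define BITS_PER_BYTE	8
#define AUFS_FIRST_INO	11UL	/* assumed */

static const unsigned long page_bits = PAGE_SIZE * BITS_PER_BYTE;

int main(void)
{
	unsigned long ino = 123456, pindex;
	int bit;

	/* what xib_calc_bit() computes */
	pindex = (ino - AUFS_FIRST_INO) / page_bits;
	bit = (ino - AUFS_FIRST_INO) % page_bits;
	/* what xib_calc_ino() computes; the two must round-trip */
	assert(AUFS_FIRST_INO + pindex * page_bits + bit == ino);
	printf("ino %lu -> page %lu, bit %d\n", ino, pindex, bit);
	return 0;
}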
+
+static int xib_pindex(struct super_block *sb, unsigned long pindex)
+{
+	int err;
+	loff_t pos;
+	ssize_t sz;
+	struct au_sbinfo *sbinfo;
+	struct file *xib;
+	unsigned long *p;
+
+	sbinfo = au_sbi(sb);
+	MtxMustLock(&sbinfo->si_xib_mtx);
+	AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE
+		  || !au_opt_test(sbinfo->si_mntflags, XINO));
+
+	if (pindex == sbinfo->si_xib_last_pindex)
+		return 0;
+
+	xib = sbinfo->si_xib;
+	p = sbinfo->si_xib_buf;
+	pos = sbinfo->si_xib_last_pindex;
+	pos *= PAGE_SIZE;
+	sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
+	if (unlikely(sz != PAGE_SIZE))
+		goto out;
+
+	pos = pindex;
+	pos *= PAGE_SIZE;
+	if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE)
+		sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos);
+	else {
+		memset(p, 0, PAGE_SIZE);
+		sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
+	}
+	if (sz == PAGE_SIZE) {
+		sbinfo->si_xib_last_pindex = pindex;
+		return 0; /* success */
+	}
+
+out:
+	AuIOErr1("write failed (%zd)\n", sz);
+	err = sz;
+	if (sz >= 0)
+		err = -EIO;
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_xib_clear_bit(struct inode *inode)
+{
+	int err, bit;
+	unsigned long pindex;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	AuDebugOn(inode->i_nlink);
+
+	sb = inode->i_sb;
+	xib_calc_bit(inode->i_ino, &pindex, &bit);
+	AuDebugOn(page_bits <= bit);
+	sbinfo = au_sbi(sb);
+	mutex_lock(&sbinfo->si_xib_mtx);
+	err = xib_pindex(sb, pindex);
+	if (!err) {
+		clear_bit(bit, sbinfo->si_xib_buf);
+		sbinfo->si_xib_next_bit = bit;
+	}
+	mutex_unlock(&sbinfo->si_xib_mtx);
+}
+
+/* for s_op->delete_inode() */
+void au_xino_delete_inode(struct inode *inode, const int unlinked)
+{
+	int err;
+	unsigned int mnt_flags;
+	aufs_bindex_t bindex, bend, bi;
+	unsigned char try_trunc;
+	struct au_iinfo *iinfo;
+	struct super_block *sb;
+	struct au_hinode *hi;
+	struct inode *h_inode;
+	struct au_branch *br;
+	vfs_writef_t xwrite;
+
+	sb = inode->i_sb;
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, XINO)
+	    || inode->i_ino == AUFS_ROOT_INO)
+		return;
+
+	if (unlinked) {
+		au_xigen_inc(inode);
+		au_xib_clear_bit(inode);
+	}
+
+	iinfo = au_ii(inode);
+	if (!iinfo)
+		return;
+
+	bindex = iinfo->ii_bstart;
+	if (bindex < 0)
+		return;
+
+	xwrite = au_sbi(sb)->si_xwrite;
+	try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO);
+	hi = iinfo->ii_hinode + bindex;
+	bend = iinfo->ii_bend;
+	for (; bindex <= bend; bindex++, hi++) {
+		h_inode = hi->hi_inode;
+		if (!h_inode
+		    || (!unlinked && h_inode->i_nlink))
+			continue;
+
+		/* inode may not be revalidated */
+		bi = au_br_index(sb, hi->hi_id);
+		if (bi < 0)
+			continue;
+
+		br = au_sbr(sb, bi);
+		err = au_xino_do_write(xwrite, br->br_xino.xi_file,
+				       h_inode->i_ino, /*ino*/0);
+		if (!err && try_trunc
+		    && au_test_fs_trunc_xino(au_br_sb(br)))
+			xino_try_trunc(sb, br);
+	}
+}
+
+/* get an unused inode number from bitmap */
+ino_t au_xino_new_ino(struct super_block *sb)
+{
+	ino_t ino;
+	unsigned long *p, pindex, ul, pend;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+	int free_bit, err;
+
+	if (!au_opt_test(au_mntflags(sb), XINO))
+		return iunique(sb, AUFS_FIRST_INO);
+
+	sbinfo = au_sbi(sb);
+	mutex_lock(&sbinfo->si_xib_mtx);
+	p = sbinfo->si_xib_buf;
+	free_bit = sbinfo->si_xib_next_bit;
+	if (free_bit < page_bits && !test_bit(free_bit, p))
+		goto out; /* success */
+	free_bit = find_first_zero_bit(p, page_bits);
+	if (free_bit < page_bits)
+		goto out; /* success */
+
+	pindex = sbinfo->si_xib_last_pindex;
+	for (ul = pindex - 1; ul < ULONG_MAX; ul--) {
+		err = xib_pindex(sb, ul);
+		if (unlikely(err))
+			goto out_err;
+		free_bit = find_first_zero_bit(p, page_bits);
+		if (free_bit < page_bits)
+			goto out; /* success */
+	}
+
+	file = sbinfo->si_xib;
+	pend = vfsub_f_size_read(file) / PAGE_SIZE;
+	for (ul = pindex + 1; ul <= pend; ul++) {
+		err = xib_pindex(sb, ul);
+		if (unlikely(err))
+			goto out_err;
+		free_bit = find_first_zero_bit(p, page_bits);
+		if (free_bit < page_bits)
+			goto out; /* success */
+	}
+	BUG();
+
+out:
+	set_bit(free_bit, p);
+	sbinfo->si_xib_next_bit = free_bit + 1;
+	pindex = sbinfo->si_xib_last_pindex;
+	mutex_unlock(&sbinfo->si_xib_mtx);
+	ino = xib_calc_ino(pindex, free_bit);
+	AuDbg("i%lu\n", (unsigned long)ino);
+	return ino;
+out_err:
+	mutex_unlock(&sbinfo->si_xib_mtx);
+	AuDbg("i0\n");
+	return 0;
+}
+
+/*
+ * read @ino from the xinofile for the specified branch{@sb, @bindex}
+ * at the position of @h_ino.
+ * if no entry exists for @h_ino, *@ino is set to zero.
+ */
+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		 ino_t *ino)
+{
+	int err;
+	ssize_t sz;
+	loff_t pos;
+	struct file *file;
+	struct au_sbinfo *sbinfo;
+
+	*ino = 0;
+	if (!au_opt_test(au_mntflags(sb), XINO))
+		return 0; /* no xino */
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	pos = h_ino;
+	if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) {
+		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
+		return -EFBIG;
+	}
+	pos *= sizeof(*ino);
+
+	file = au_sbr(sb, bindex)->br_xino.xi_file;
+	if (vfsub_f_size_read(file) < pos + sizeof(*ino))
+		return 0; /* no ino */
+
+	sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos);
+	if (sz == sizeof(*ino))
+		return 0; /* success */
+
+	err = sz;
+	if (unlikely(sz >= 0)) {
+		err = -EIO;
+		AuIOErr("xino read error (%zd)\n", sz);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* create and set a new xino file */
+
+struct file *au_xino_create(struct super_block *sb, char *fname, int silent)
+{
+	struct file *file;
+	struct dentry *h_parent, *d;
+	struct inode *h_dir, *inode;
+	int err;
+
+	/*
+	 * at mount time, when the xino file is at the default path,
+	 * hnotify is disabled so we have no notify events to ignore.
+	 * when a user specifies the xino, we cannot get au_hdir to be ignored.
+	 */
+	file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
+			       /* | __FMODE_NONOTIFY */,
+			       S_IRUGO | S_IWUGO);
+	if (IS_ERR(file)) {
+		if (!silent)
+			pr_err("open %s(%ld)\n", fname, PTR_ERR(file));
+		return file;
+	}
+
+	/* keep file count */
+	err = 0;
+	inode = file_inode(file);
+	h_parent = dget_parent(file->f_path.dentry);
+	h_dir = d_inode(h_parent);
+	inode_lock_nested(h_dir, AuLsc_I_PARENT);
+	/* mnt_want_write() is unnecessary here */
+	/* no delegation since it is just created */
+	if (inode->i_nlink)
+		err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL,
+				   /*force*/0);
+	inode_unlock(h_dir);
+	dput(h_parent);
+	if (unlikely(err)) {
+		if (!silent)
+			pr_err("unlink %s(%d)\n", fname, err);
+		goto out;
+	}
+
+	err = -EINVAL;
+	d = file->f_path.dentry;
+	if (unlikely(sb == d->d_sb)) {
+		if (!silent)
+			pr_err("%s must be outside\n", fname);
+		goto out;
+	}
+	if (unlikely(au_test_fs_bad_xino(d->d_sb))) {
+		if (!silent)
+			pr_err("xino doesn't support %s(%s)\n",
+			       fname, au_sbtype(d->d_sb));
+		goto out;
+	}
+	return file; /* success */
+
+out:
+	fput(file);
+	file = ERR_PTR(err);
+	return file;
+}
+
+/*
+ * find another branch that is on the same filesystem as the specified
+ * branch{@btgt}. search until @bend.
+ */
+static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt,
+			aufs_bindex_t bend)
+{
+	aufs_bindex_t bindex;
+	struct super_block *tgt_sb = au_sbr_sb(sb, btgt);
+
+	for (bindex = 0; bindex < btgt; bindex++)
+		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
+			return bindex;
+	for (bindex++; bindex <= bend; bindex++)
+		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
+			return bindex;
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * initialize the xinofile for the specified branch @br
+ * at the place/path where @base_file indicates.
+ * test whether another branch is on the same filesystem or not,
+ * if @do_test is true.
+ */
+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino,
+	       struct file *base_file, int do_test)
+{
+	int err;
+	ino_t ino;
+	aufs_bindex_t bend, bindex;
+	struct au_branch *shared_br, *b;
+	struct file *file;
+	struct super_block *tgt_sb;
+
+	shared_br = NULL;
+	bend = au_sbend(sb);
+	if (do_test) {
+		tgt_sb = au_br_sb(br);
+		for (bindex = 0; bindex <= bend; bindex++) {
+			b = au_sbr(sb, bindex);
+			if (tgt_sb == au_br_sb(b)) {
+				shared_br = b;
+				break;
+			}
+		}
+	}
+
+	if (!shared_br || !shared_br->br_xino.xi_file) {
+		struct au_xino_lock_dir ldir;
+
+		au_xino_lock_dir(sb, base_file, &ldir);
+		/* mnt_want_write() is unnecessary here */
+		file = au_xino_create2(base_file, NULL);
+		au_xino_unlock_dir(&ldir);
+		err = PTR_ERR(file);
+		if (IS_ERR(file))
+			goto out;
+		br->br_xino.xi_file = file;
+	} else {
+		br->br_xino.xi_file = shared_br->br_xino.xi_file;
+		get_file(br->br_xino.xi_file);
+	}
+
+	ino = AUFS_ROOT_INO;
+	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
+			       h_ino, ino);
+	if (unlikely(err)) {
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = NULL;
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* truncate a xino bitmap file */
+
+/* todo: slow */
+static int do_xib_restore(struct super_block *sb, struct file *file, void *page)
+{
+	int err, bit;
+	ssize_t sz;
+	unsigned long pindex;
+	loff_t pos, pend;
+	struct au_sbinfo *sbinfo;
+	vfs_readf_t func;
+	ino_t *ino;
+	unsigned long *p;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	MtxMustLock(&sbinfo->si_xib_mtx);
+	p = sbinfo->si_xib_buf;
+	func = sbinfo->si_xread;
+	pend = vfsub_f_size_read(file);
+	pos = 0;
+	while (pos < pend) {
+		sz = xino_fread(func, file, page, PAGE_SIZE, &pos);
+		err = sz;
+		if (unlikely(sz <= 0))
+			goto out;
+
+		err = 0;
+		for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) {
+			if (unlikely(*ino < AUFS_FIRST_INO))
+				continue;
+
+			xib_calc_bit(*ino, &pindex, &bit);
+			AuDebugOn(page_bits <= bit);
+			err = xib_pindex(sb, pindex);
+			if (!err)
+				set_bit(bit, p);
+			else
+				goto out;
+		}
+	}
+
+out:
+	return err;
+}
+
+static int xib_restore(struct super_block *sb)
+{
+	int err;
+	aufs_bindex_t bindex, bend;
+	void *page;
+
+	err = -ENOMEM;
+	page = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!page))
+		goto out;
+
+	err = 0;
+	bend = au_sbend(sb);
+	for (bindex = 0; !err && bindex <= bend; bindex++)
+		if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0)
+			err = do_xib_restore
+				(sb, au_sbr(sb, bindex)->br_xino.xi_file, page);
+		else
+			AuDbg("b%d\n", bindex);
+	free_page((unsigned long)page);
+
+out:
+	return err;
+}
+
+int au_xib_trunc(struct super_block *sb)
+{
+	int err;
+	ssize_t sz;
+	loff_t pos;
+	struct au_xino_lock_dir ldir;
+	struct au_sbinfo *sbinfo;
+	unsigned long *p;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	if (!au_opt_test(sbinfo->si_mntflags, XINO))
+		goto out;
+
+	file = sbinfo->si_xib;
+	if (vfsub_f_size_read(file) <= PAGE_SIZE)
+		goto out;
+
+	au_xino_lock_dir(sb, file, &ldir);
+	/* mnt_want_write() is unnecessary here */
+	file = au_xino_create2(sbinfo->si_xib, NULL);
+	au_xino_unlock_dir(&ldir);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	fput(sbinfo->si_xib);
+	sbinfo->si_xib = file;
+
+	p = sbinfo->si_xib_buf;
+	memset(p, 0, PAGE_SIZE);
+	pos = 0;
+	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos);
+	if (unlikely(sz != PAGE_SIZE)) {
+		err = sz;
+		AuIOErr("err %d\n", err);
+		if (sz >= 0)
+			err = -EIO;
+		goto out;
+	}
+
+	mutex_lock(&sbinfo->si_xib_mtx);
+	/* mnt_want_write() is unnecessary here */
+	err = xib_restore(sb);
+	mutex_unlock(&sbinfo->si_xib_mtx);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * xino mount option handlers
+ */
+
+/* xino bitmap */
+static void xino_clear_xib(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	sbinfo->si_xread = NULL;
+	sbinfo->si_xwrite = NULL;
+	if (sbinfo->si_xib)
+		fput(sbinfo->si_xib);
+	sbinfo->si_xib = NULL;
+	free_page((unsigned long)sbinfo->si_xib_buf);
+	sbinfo->si_xib_buf = NULL;
+}
+
+static int au_xino_set_xib(struct super_block *sb, struct file *base)
+{
+	int err;
+	loff_t pos;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	file = au_xino_create2(base, sbinfo->si_xib);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	if (sbinfo->si_xib)
+		fput(sbinfo->si_xib);
+	sbinfo->si_xib = file;
+	sbinfo->si_xread = vfs_readf(file);
+	sbinfo->si_xwrite = vfs_writef(file);
+
+	err = -ENOMEM;
+	if (!sbinfo->si_xib_buf)
+		sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS);
+	if (unlikely(!sbinfo->si_xib_buf))
+		goto out_unset;
+
+	sbinfo->si_xib_last_pindex = 0;
+	sbinfo->si_xib_next_bit = 0;
+	if (vfsub_f_size_read(file) < PAGE_SIZE) {
+		pos = 0;
+		err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf,
+				  PAGE_SIZE, &pos);
+		if (unlikely(err != PAGE_SIZE))
+			goto out_free;
+	}
+	err = 0;
+	goto out; /* success */
+
+out_free:
+	free_page((unsigned long)sbinfo->si_xib_buf);
+	sbinfo->si_xib_buf = NULL;
+	if (err >= 0)
+		err = -EIO;
+out_unset:
+	fput(sbinfo->si_xib);
+	sbinfo->si_xib = NULL;
+	sbinfo->si_xread = NULL;
+	sbinfo->si_xwrite = NULL;
+out:
+	return err;
+}
+
+/* xino for each branch */
+static void xino_clear_br(struct super_block *sb)
+{
+	aufs_bindex_t bindex, bend;
+	struct au_branch *br;
+
+	bend = au_sbend(sb);
+	for (bindex = 0; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!br || !br->br_xino.xi_file)
+			continue;
+
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = NULL;
+	}
+}
+
+static int au_xino_set_br(struct super_block *sb, struct file *base)
+{
+	int err;
+	ino_t ino;
+	aufs_bindex_t bindex, bend, bshared;
+	struct {
+		struct file *old, *new;
+	} *fpair, *p;
+	struct au_branch *br;
+	struct inode *inode;
+	vfs_writef_t writef;
+
+	SiMustWriteLock(sb);
+
+	err = -ENOMEM;
+	bend = au_sbend(sb);
+	fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS);
+	if (unlikely(!fpair))
+		goto out;
+
+	inode = d_inode(sb->s_root);
+	ino = AUFS_ROOT_INO;
+	writef = au_sbi(sb)->si_xwrite;
+	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
+		br = au_sbr(sb, bindex);
+		bshared = is_sb_shared(sb, bindex, bindex - 1);
+		if (bshared >= 0) {
+			/* shared xino */
+			*p = fpair[bshared];
+			get_file(p->new);
+		}
+
+		if (!p->new) {
+			/* new xino */
+			p->old = br->br_xino.xi_file;
+			p->new = au_xino_create2(base, br->br_xino.xi_file);
+			err = PTR_ERR(p->new);
+			if (IS_ERR(p->new)) {
+				p->new = NULL;
+				goto out_pair;
+			}
+		}
+
+		err = au_xino_do_write(writef, p->new,
+				       au_h_iptr(inode, bindex)->i_ino, ino);
+		if (unlikely(err))
+			goto out_pair;
+	}
+
+	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
+		br = au_sbr(sb, bindex);
+		if (br->br_xino.xi_file)
+			fput(br->br_xino.xi_file);
+		get_file(p->new);
+		br->br_xino.xi_file = p->new;
+	}
+
+out_pair:
+	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++)
+		if (p->new)
+			fput(p->new);
+		else
+			break;
+	kfree(fpair);
+out:
+	return err;
+}
+
+void au_xino_clr(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	au_xigen_clr(sb);
+	xino_clear_xib(sb);
+	xino_clear_br(sb);
+	sbinfo = au_sbi(sb);
+	/* lvalue, do not call au_mntflags() */
+	au_opt_clr(sbinfo->si_mntflags, XINO);
+}
+
+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount)
+{
+	int err, skip;
+	struct dentry *parent, *cur_parent;
+	struct qstr *dname, *cur_name;
+	struct file *cur_xino;
+	struct inode *dir;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	parent = dget_parent(xino->file->f_path.dentry);
+	if (remount) {
+		skip = 0;
+		dname = &xino->file->f_path.dentry->d_name;
+		cur_xino = sbinfo->si_xib;
+		if (cur_xino) {
+			cur_parent = dget_parent(cur_xino->f_path.dentry);
+			cur_name = &cur_xino->f_path.dentry->d_name;
+			skip = (cur_parent == parent
+				&& au_qstreq(dname, cur_name));
+			dput(cur_parent);
+		}
+		if (skip)
+			goto out;
+	}
+
+	au_opt_set(sbinfo->si_mntflags, XINO);
+	dir = d_inode(parent);
+	inode_lock_nested(dir, AuLsc_I_PARENT);
+	/* mnt_want_write() is unnecessary here */
+	err = au_xino_set_xib(sb, xino->file);
+	if (!err)
+		err = au_xigen_set(sb, xino->file);
+	if (!err)
+		err = au_xino_set_br(sb, xino->file);
+	inode_unlock(dir);
+	if (!err)
+		goto out; /* success */
+
+	/* reset all */
+	AuIOErr("failed creating xino(%d).\n", err);
+	au_xigen_clr(sb);
+	xino_clear_xib(sb);
+
+out:
+	dput(parent);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create a xinofile at the default place/path.
+ */
+struct file *au_xino_def(struct super_block *sb)
+{
+	struct file *file;
+	char *page, *p;
+	struct au_branch *br;
+	struct super_block *h_sb;
+	struct path path;
+	aufs_bindex_t bend, bindex, bwr;
+
+	br = NULL;
+	bend = au_sbend(sb);
+	bwr = -1;
+	for (bindex = 0; bindex <= bend; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (au_br_writable(br->br_perm)
+		    && !au_test_fs_bad_xino(au_br_sb(br))) {
+			bwr = bindex;
+			break;
+		}
+	}
+
+	if (bwr >= 0) {
+		file = ERR_PTR(-ENOMEM);
+		page = (void *)__get_free_page(GFP_NOFS);
+		if (unlikely(!page))
+			goto out;
+		path.mnt = au_br_mnt(br);
+		path.dentry = au_h_dptr(sb->s_root, bwr);
+		p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME));
+		file = (void *)p;
+		if (!IS_ERR(p)) {
+			strcat(p, "/" AUFS_XINO_FNAME);
+			AuDbg("%s\n", p);
+			file = au_xino_create(sb, p, /*silent*/0);
+			if (!IS_ERR(file))
+				au_xino_brid_set(sb, br->br_id);
+		}
+		free_page((unsigned long)page);
+	} else {
+		file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0);
+		if (IS_ERR(file))
+			goto out;
+		h_sb = file->f_path.dentry->d_sb;
+		if (unlikely(au_test_fs_bad_xino(h_sb))) {
+			pr_err("xino doesn't support %s(%s)\n",
+			       AUFS_XINO_DEFPATH, au_sbtype(h_sb));
+			fput(file);
+			file = ERR_PTR(-EINVAL);
+		}
+		if (!IS_ERR(file))
+			au_xino_brid_set(sb, -1);
+	}
+
+out:
+	return file;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_xino_path(struct seq_file *seq, struct file *file)
+{
+	int err;
+
+	err = au_seq_path(seq, &file->f_path);
+	if (unlikely(err))
+		goto out;
+
+#define Deleted "\\040(deleted)"
+	seq->count -= sizeof(Deleted) - 1;
+	AuDebugOn(memcmp(seq->buf + seq->count, Deleted,
+			 sizeof(Deleted) - 1));
+#undef Deleted
+
+out:
+	return err;
+}
diff --git a/fs/dcache.c b/fs/dcache.c
index 7566b26..6bdabe2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1156,7 +1156,7 @@ enum d_walk_ret {
  *
  * The @enter() and @finish() callbacks are called with d_lock held.
  */
-static void d_walk(struct dentry *parent, void *data,
+void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *),
 		   void (*finish)(void *))
 {
@@ -1261,6 +1261,7 @@ rename_retry:
 	seq = 1;
 	goto again;
 }
+EXPORT_SYMBOL_GPL(d_walk);
 
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..d7ca3d3 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,11 +8,14 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
+#include <linux/export.h>
 #include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
+extern void iterate_supers_no_sb_lock(void (*f)(struct super_block *, void *), void *arg);
+
 static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
@@ -39,6 +42,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
+/* For TuxOnIce */
+void drop_pagecache(void)
+{
+	iterate_supers_no_sb_lock(drop_pagecache_sb, NULL);
+}
+
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/fs/exec.c b/fs/exec.c
index dcd4ac7..2cc69a4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -57,6 +57,8 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 
+#include <trace/events/fs.h>
+
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
@@ -103,6 +105,7 @@ bool path_noexec(const struct path *path)
 	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
 	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
 }
+EXPORT_SYMBOL_GPL(path_noexec);
 
 #ifdef CONFIG_USELIB
 /*
@@ -790,8 +793,10 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 	if (err)
 		goto exit;
 
-	if (name->name[0] != '\0')
+	if (name->name[0] != '\0') {
 		fsnotify_open(file);
+		trace_open_exec(name->name);
+	}
 
 out:
 	return file;
diff --git b/fs/exfat/Kconfig b/fs/exfat/Kconfig
new file mode 100644
index 0000000..78b32aa
--- /dev/null
+++ b/fs/exfat/Kconfig
@@ -0,0 +1,39 @@
+config EXFAT_FS
+	tristate "exFAT fs support"
+	select NLS
+	help
+	  This adds support for the exFAT file system.
+
+config EXFAT_DISCARD
+	bool "enable discard support"
+	depends on EXFAT_FS
+	default y
+
+config EXFAT_DELAYED_SYNC
+	bool "enable delayed sync"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_KERNEL_DEBUG
+	bool "enable kernel debug features via ioctl"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_DEBUG_MSG
+	bool "print debug messages"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_DEFAULT_CODEPAGE
+	int "Default codepage for exFAT"
+	default 437
+	depends on EXFAT_FS
+	help
+	  This option should be set to the codepage of your exFAT filesystems.
+
+config EXFAT_DEFAULT_IOCHARSET
+	string "Default iocharset for exFAT"
+	default "utf8"
+	depends on EXFAT_FS
+	help
+	  Set this to the default input/output character set you'd like exFAT to use.
diff --git b/fs/exfat/LICENSE b/fs/exfat/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/fs/exfat/LICENSE
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git b/fs/exfat/Makefile b/fs/exfat/Makefile
new file mode 100644
index 0000000..fb2be9d
--- /dev/null
+++ b/fs/exfat/Makefile
@@ -0,0 +1,55 @@
+#
+# Makefile for Linux FAT12/FAT16/FAT32(VFAT)/FAT64(ExFAT) filesystem driver.
+#
+
+ifneq ($(KERNELRELEASE),)
+# call from kernel build system
+
+obj-$(CONFIG_EXFAT_FS) += exfat.o
+
+exfat-objs := exfat_core.o exfat_super.o exfat_api.o exfat_blkdev.o exfat_cache.o \
+			   exfat_data.o exfat_bitmap.o exfat_nls.o exfat_oal.o exfat_upcase.o
+
+else
+# external module build
+
+EXTRA_FLAGS += -I$(PWD)
+
+#
+# KDIR is a path to a directory containing kernel source.
+# It can be specified on the command line passed to make to enable the module to
+# be built and installed for a kernel other than the one currently running.
+# By default it is the path to the symbolic link created when
+# the current kernel's modules were installed, but
+# any valid path to the directory in which the target kernel's source is located
+# can be provided on the command line.
+#
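+# Example (path is illustrative only): build against a specific kernel
+# tree's headers instead of the running kernel by overriding KDIR on the
+# command line:
+#   make KDIR=/usr/src/linux-headers-4.4.0-generic
+#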
+KDIR	:= /lib/modules/$(shell uname -r)/build
+MDIR	:= /lib/modules/$(shell uname -r)
+PWD	:= $(shell pwd)
+KREL	:= $(shell cd ${KDIR} && make -s kernelrelease)
+
+export CONFIG_EXFAT_FS := m
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
+
+help:
+	$(MAKE) -C $(KDIR) M=$(PWD) help
+
+install: exfat.ko
+	rm -f ${MDIR}/kernel/fs/exfat/exfat.ko
+	install -m644 -b -D exfat.ko ${MDIR}/kernel/fs/exfat/exfat.ko
+	depmod -aq
+
+uninstall:
+	rm -rf ${MDIR}/kernel/fs/exfat
+	depmod -aq
+
+endif
+
+.PHONY : all clean install uninstall
diff --git b/fs/exfat/README.md b/fs/exfat/README.md
new file mode 100644
index 0000000..3e1887f
--- /dev/null
+++ b/fs/exfat/README.md
@@ -0,0 +1,74 @@
+exfat-nofuse
+============
+
+Linux non-FUSE read/write kernel driver for the exFAT, FAT12, FAT16 and vfat (FAT32) file systems.<br />
+Originally ported from Android kernel v3.0.
+
+Kudos to ksv1986 for the mutex patch!<br />
+Thanks to JackNorris for being awesome and providing the clear_inode() patch.<br />
+<br />
+Big thanks to lqs for completing the driver!<br />
+Big thanks to benpicco for fixing 3.11.y compatibility!
+
+
+Special thanks to github user AndreiLux for spreading the word about the leak!<br />
+
+
+Installing as a stand-alone module:
+====================================
+
+    make
+    sudo make install
+
+To load the driver manually, run this as root:
+
+    modprobe exfat
+
+You may also specify a custom toolchain by setting the CROSS_COMPILE flag, for example:
+>CROSS_COMPILE=../dorimanx-SG2-I9100-Kernel/android-toolchain/bin/arm-eabi-
+
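+Once the module is loaded, an exFAT volume can be mounted in the usual way.
+The device and mount point below are only examples:
+
+    mount -t exfat /dev/sdX1 /mnt/exfat
+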
+Installing as a part of the kernel:
+======================================
+
+Let's take [linux] as the path to your kernel source dir...
+
+	cd [linux]
+	cp -rvf exfat-nofuse [linux]/fs/exfat
+
+edit [linux]/fs/Kconfig
+```
+ menu "DOS/FAT/NT Filesystems"
+
+  source "fs/fat/Kconfig"
+ +source "fs/exfat/Kconfig"
+  source "fs/ntfs/Kconfig"
+  endmenu
+```
+  
+
+edit [linux]/fs/Makefile
+```
+  obj-$(CONFIG_FAT_FS)    += fat/
+ +obj-$(CONFIG_EXFAT_FS)  += exfat/
+  obj-$(CONFIG_BFS_FS)    += bfs/
+```
+
+	cd [linux]
+	make menuconfig
+
+Go to:
+> File systems > DOS/FAT/NT
+>   check exfat as MODULE (M)
+>   (437) Default codepage for exFAT
+>   (utf8) Default iocharset for exFAT
+
+> ESC to main menu
+> Save an Alternate Configuration File
+> ESC ESC
+
+Build your kernel.
+
+Have fun.
+
+Free Software for the Free Minds!
+=================================
diff --git b/fs/exfat/exfat_api.c b/fs/exfat/exfat_api.c
new file mode 100644
index 0000000..32b29f0
--- /dev/null
+++ b/fs/exfat/exfat_api.c
@@ -0,0 +1,528 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_api.c                                               */
+/*  PURPOSE : exFAT API Glue Layer                                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "exfat_version.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+extern struct semaphore z_sem;
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Function Declarations                                         */
+/*----------------------------------------------------------------------*/
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*    - All functions for global use share the same return value format, */
+/*      i.e. FFS_SUCCESS on success, or one of the FFS error codes on    */
+/*      the various error conditions.                                    */
+/*======================================================================*/
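+
+/*
+ * Illustrative caller pattern (not part of the original source): every API
+ * entry point below is expected to be checked against FFS_SUCCESS, e.g.
+ *
+ *	if (FsMountVol(sb) != FFS_SUCCESS)
+ *		goto out_err;
+ */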
+
+/*----------------------------------------------------------------------*/
+/*  exFAT Filesystem Init & Exit Functions                              */
+/*----------------------------------------------------------------------*/
+
+int FsInit(void)
+{
+	return ffsInit();
+}
+
+int FsShutdown(void)
+{
+	return ffsShutdown();
+}
+
+/*----------------------------------------------------------------------*/
+/*  Volume Management Functions                                         */
+/*----------------------------------------------------------------------*/
+
+/* FsMountVol : mount the file system volume */
+int FsMountVol(struct super_block *sb)
+{
+	int err;
+
+	sm_P(&z_sem);
+
+	err = buf_init(sb);
+	if (!err)
+		err = ffsMountVol(sb);
+	else
+		buf_shutdown(sb);
+
+	sm_V(&z_sem);
+
+	return err;
+} /* end of FsMountVol */
+
+/* FsUmountVol : unmount the file system volume */
+int FsUmountVol(struct super_block *sb)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&z_sem);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsUmountVol(sb);
+	buf_shutdown(sb);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	sm_V(&z_sem);
+
+	return err;
+} /* end of FsUmountVol */
+
+/* FsGetVolInfo : get the information of a file system volume */
+int FsGetVolInfo(struct super_block *sb, VOL_INFO_T *info)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (info == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsGetVolInfo(sb, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsGetVolInfo */
+
+/* FsSyncVol : synchronize a file system volume */
+int FsSyncVol(struct super_block *sb, int do_sync)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsSyncVol(sb, do_sync);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsSyncVol */
+
+
+/*----------------------------------------------------------------------*/
+/*  File Operation Functions                                            */
+/*----------------------------------------------------------------------*/
+
+/* FsLookupFile : look up a file */
+int FsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsLookupFile(inode, path, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsLookupFile */
+
+/* FsCreateFile : create a file */
+int FsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsCreateFile(inode, path, mode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsCreateFile */
+
+int FsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* check the validity of pointer parameters */
+	if (buffer == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsReadFile(inode, fid, buffer, count, rcount);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadFile */
+
+int FsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* check the validity of pointer parameters */
+	if (buffer == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsWriteFile(inode, fid, buffer, count, wcount);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsWriteFile */
+
+/* FsTruncateFile : resize the file length */
+int FsTruncateFile(struct inode *inode, u64 old_size, u64 new_size)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	DPRINTK("FsTruncateFile entered (inode %p size %llu)\n", inode, new_size);
+
+	err = ffsTruncateFile(inode, old_size, new_size);
+
+	DPRINTK("FsTruncateFile exitted (%d)\n", err);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsTruncateFile */
+
+/* FsMoveFile : move (rename) an old file to a new file */
+int FsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry)
+{
+	int err;
+	struct super_block *sb = old_parent_inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsMoveFile(old_parent_inode, fid, new_parent_inode, new_dentry);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsMoveFile */
+
+/* FsRemoveFile : remove a file */
+int FsRemoveFile(struct inode *inode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsRemoveFile(inode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsRemoveFile */
+
+/* FsSetAttr : set the attribute of a given file */
+int FsSetAttr(struct inode *inode, u32 attr)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsSetAttr(inode, attr);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsSetAttr */
+
+/* FsReadStat : get the information of a given file */
+int FsReadStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsGetStat(inode, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadStat */
+
+/* FsWriteStat : set the information of a given file */
+int FsWriteStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	DPRINTK("FsWriteStat entered (inode %p info %p\n", inode, info);
+
+	err = ffsSetStat(inode, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	DPRINTK("FsWriteStat exited (%d)\n", err);
+
+	return err;
+} /* end of FsWriteStat */
+
+/* FsMapCluster : return the cluster number in the given cluster offset */
+int FsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (clu == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsMapCluster(inode, clu_offset, clu);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsMapCluster */
+
+/*----------------------------------------------------------------------*/
+/*  Directory Operation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+/* FsCreateDir : create(make) a directory */
+int FsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsCreateDir(inode, path, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsCreateDir */
+
+/* FsReadDir : read a directory entry from the opened directory */
+int FsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (dir_entry == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsReadDir(inode, dir_entry);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadDir */
+
+/* FsRemoveDir : remove a directory */
+int FsRemoveDir(struct inode *inode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsRemoveDir(inode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsRemoveDir */
+
+EXPORT_SYMBOL(FsMountVol);
+EXPORT_SYMBOL(FsUmountVol);
+EXPORT_SYMBOL(FsGetVolInfo);
+EXPORT_SYMBOL(FsSyncVol);
+EXPORT_SYMBOL(FsLookupFile);
+EXPORT_SYMBOL(FsCreateFile);
+EXPORT_SYMBOL(FsReadFile);
+EXPORT_SYMBOL(FsWriteFile);
+EXPORT_SYMBOL(FsTruncateFile);
+EXPORT_SYMBOL(FsMoveFile);
+EXPORT_SYMBOL(FsRemoveFile);
+EXPORT_SYMBOL(FsSetAttr);
+EXPORT_SYMBOL(FsReadStat);
+EXPORT_SYMBOL(FsWriteStat);
+EXPORT_SYMBOL(FsMapCluster);
+EXPORT_SYMBOL(FsCreateDir);
+EXPORT_SYMBOL(FsReadDir);
+EXPORT_SYMBOL(FsRemoveDir);
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+/* FsReleaseCache: Release FAT & buf cache */
+int FsReleaseCache(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	FAT_release_all(sb);
+	buf_release_all(sb);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return 0;
+} /* end of FsReleaseCache */
+
+EXPORT_SYMBOL(FsReleaseCache);
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
diff --git b/fs/exfat/exfat_api.h b/fs/exfat/exfat_api.h
new file mode 100644
index 0000000..84bdf61
--- /dev/null
+++ b/fs/exfat/exfat_api.h
@@ -0,0 +1,206 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_api.h                                               */
+/*  PURPOSE : Header File for exFAT API Glue Layer                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_API_H
+#define _EXFAT_API_H
+
+#include <linux/fs.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define EXFAT_SUPER_MAGIC       (0x2011BAB0L)
+#define EXFAT_ROOT_INO          1
+
+/* FAT types */
+#define FAT12                   0x01    /* FAT12 */
+#define FAT16                   0x0E    /* Win95 FAT16 (LBA) */
+#define FAT32                   0x0C    /* Win95 FAT32 (LBA) */
+#define EXFAT                   0x07    /* exFAT */
+
+/* file name lengths */
+#define MAX_CHARSET_SIZE        3       /* max size of multi-byte character	*/
+#define MAX_PATH_DEPTH          15      /* max depth of path name */
+#define MAX_NAME_LENGTH         256     /* max len of file name including NULL */
+#define MAX_PATH_LENGTH         260     /* max len of path name including NULL */
+#define DOS_NAME_LENGTH         11      /* DOS file name length excluding NULL */
+#define DOS_PATH_LENGTH         80      /* DOS path name length excluding NULL */
+
+/* file attributes */
+#define ATTR_NORMAL             0x0000
+#define ATTR_READONLY           0x0001
+#define ATTR_HIDDEN             0x0002
+#define ATTR_SYSTEM             0x0004
+#define ATTR_VOLUME             0x0008
+#define ATTR_SUBDIR             0x0010
+#define ATTR_ARCHIVE            0x0020
+#define ATTR_SYMLINK            0x0040
+#define ATTR_EXTEND             0x000F
+#define ATTR_RWMASK             0x007E
+
+/* file creation modes */
+#define FM_REGULAR              0x00
+#define FM_SYMLINK              0x40
+
+/* return values */
+#define FFS_SUCCESS             0
+#define FFS_MEDIAERR            1
+#define FFS_FORMATERR           2
+#define FFS_MOUNTED             3
+#define FFS_NOTMOUNTED          4
+#define FFS_ALIGNMENTERR        5
+#define FFS_SEMAPHOREERR        6
+#define FFS_INVALIDPATH         7
+#define FFS_INVALIDFID          8
+#define FFS_NOTFOUND            9
+#define FFS_FILEEXIST           10
+#define FFS_PERMISSIONERR       11
+#define FFS_NOTOPENED           12
+#define FFS_MAXOPENED           13
+#define FFS_FULL                14
+#define FFS_EOF                 15
+#define FFS_DIRBUSY             16
+#define FFS_MEMORYERR           17
+#define FFS_NAMETOOLONG		18
+#define FFS_ERROR               19
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct {
+	u16      Year;
+	u16      Month;
+	u16      Day;
+	u16      Hour;
+	u16      Minute;
+	u16      Second;
+	u16      MilliSecond;
+} DATE_TIME_T;
+
+typedef struct {
+	u32      Offset;    /* start sector number of the partition */
+	u32      Size;      /* in sectors */
+} PART_INFO_T;
+
+typedef struct {
+	u32      SecSize;    /* sector size in bytes */
+	u32      DevSize;    /* block device size in sectors */
+} DEV_INFO_T;
+
+typedef struct {
+	u32      FatType;
+	u32      ClusterSize;
+	u32      NumClusters;
+	u32      FreeClusters;
+	u32      UsedClusters;
+} VOL_INFO_T;
+
+/* directory structure */
+typedef struct {
+	u32      dir;
+	s32       size;
+	u8       flags;
+} CHAIN_T;
+
+/* file id structure */
+typedef struct {
+	CHAIN_T     dir;
+	s32       entry;
+	u32      type;
+	u32      attr;
+	u32      start_clu;
+	u64      size;
+	u8       flags;
+	s64       rwoffset;
+	s32       hint_last_off;
+	u32      hint_last_clu;
+} FILE_ID_T;
+
+typedef struct {
+	char        Name[MAX_NAME_LENGTH * MAX_CHARSET_SIZE];
+	char        ShortName[DOS_NAME_LENGTH + 2];     /* used only for FAT12/16/32, not used for exFAT */
+	u32      Attr;
+	u64      Size;
+	u32      NumSubdirs;
+	DATE_TIME_T CreateTimestamp;
+	DATE_TIME_T ModifyTimestamp;
+	DATE_TIME_T AccessTimestamp;
+} DIR_ENTRY_T;
+
+/*======================================================================*/
+/*                                                                      */
+/*                     API FUNCTION DECLARATIONS                        */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* file system initialization & shutdown functions */
+	int FsInit(void);
+	int FsShutdown(void);
+
+/* volume management functions */
+	int FsMountVol(struct super_block *sb);
+	int FsUmountVol(struct super_block *sb);
+	int FsGetVolInfo(struct super_block *sb, VOL_INFO_T *info);
+	int FsSyncVol(struct super_block *sb, int do_sync);
+
+/* file management functions */
+	int FsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid);
+	int FsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid);
+	int FsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount);
+	int FsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount);
+	int FsTruncateFile(struct inode *inode, u64 old_size, u64 new_size);
+	int FsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry);
+	int FsRemoveFile(struct inode *inode, FILE_ID_T *fid);
+	int FsSetAttr(struct inode *inode, u32 attr);
+	int FsReadStat(struct inode *inode, DIR_ENTRY_T *info);
+	int FsWriteStat(struct inode *inode, DIR_ENTRY_T *info);
+	int FsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu);
+
+/* directory management functions */
+	int FsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid);
+	int FsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry);
+	int FsRemoveDir(struct inode *inode, FILE_ID_T *fid);
+
+/* debug functions */
+s32 FsReleaseCache(struct super_block *sb);
+
+#endif /* _EXFAT_API_H */
diff --git b/fs/exfat/exfat_bitmap.c b/fs/exfat/exfat_bitmap.c
new file mode 100644
index 0000000..b0672dd
--- /dev/null
+++ b/fs/exfat/exfat_bitmap.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_bitmap.c                                            */
+/*  PURPOSE : exFAT Bitmap Manipulation Functions                       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_bitmap.h"
+
+/*----------------------------------------------------------------------*/
+/*  Bitmap Manipulation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+#define BITMAP_LOC(v)           ((v) >> 3)
+#define BITMAP_SHIFT(v)         ((v) & 0x07)
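+
+/*
+ * Worked example (illustration only): for i = 11, BITMAP_LOC(11) == 1 and
+ * BITMAP_SHIFT(11) == 3, so bit 11 is bit 3 of bitmap[1].
+ */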
+
+s32 exfat_bitmap_test(u8 *bitmap, int i)
+{
+	u8 data;
+
+	data = bitmap[BITMAP_LOC(i)];
+	if ((data >> BITMAP_SHIFT(i)) & 0x01)
+		return 1;
+	return 0;
+} /* end of exfat_bitmap_test */
+
+void exfat_bitmap_set(u8 *bitmap, int i)
+{
+	bitmap[BITMAP_LOC(i)] |= (0x01 << BITMAP_SHIFT(i));
+} /* end of exfat_bitmap_set */
+
+void exfat_bitmap_clear(u8 *bitmap, int i)
+{
+	bitmap[BITMAP_LOC(i)] &= ~(0x01 << BITMAP_SHIFT(i));
+} /* end of exfat_bitmap_clear */
diff --git b/fs/exfat/exfat_bitmap.h b/fs/exfat/exfat_bitmap.h
new file mode 100644
index 0000000..4f482c7
--- /dev/null
+++ b/fs/exfat/exfat_bitmap.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_bitmap.h                                            */
+/*  PURPOSE : Header File for exFAT Bitmap Manipulation Functions       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_BITMAP_H
+#define _EXFAT_BITMAP_H
+
+#include <linux/types.h>
+
+/*======================================================================*/
+/*                                                                      */
+/*       LIBRARY FUNCTION DECLARATIONS -- OTHER UTILITY FUNCTIONS       */
+/*                    (DO NOT CHANGE THIS PART !!)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  Bitmap Manipulation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+s32	exfat_bitmap_test(u8 *bitmap, int i);
+void	exfat_bitmap_set(u8 *bitmap, int i);
+void	exfat_bitmap_clear(u8 *bitmap, int i);
+
+#endif /* _EXFAT_BITMAP_H */
diff --git b/fs/exfat/exfat_blkdev.c b/fs/exfat/exfat_blkdev.c
new file mode 100644
index 0000000..02fa4fe
--- /dev/null
+++ b/fs/exfat/exfat_blkdev.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_blkdev.c                                            */
+/*  PURPOSE : exFAT Block Device Driver Glue Layer                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/blkdev.h>
+#include <linux/log2.h>
+#include "exfat_config.h"
+#include "exfat_blkdev.h"
+#include "exfat_data.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+/*======================================================================*/
+/*  Function Definitions                                                */
+/*======================================================================*/
+
+s32 bdev_init(void)
+{
+	return FFS_SUCCESS;
+}
+
+s32 bdev_shutdown(void)
+{
+	return FFS_SUCCESS;
+}
+
+s32 bdev_open(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bd->opened)
+		return FFS_SUCCESS;
+
+	p_bd->sector_size      = bdev_logical_block_size(sb->s_bdev);
+	p_bd->sector_size_bits = ilog2(p_bd->sector_size);
+	p_bd->sector_size_mask = p_bd->sector_size - 1;
+	p_bd->num_sectors      = i_size_read(sb->s_bdev->bd_inode) >> p_bd->sector_size_bits;
+
+	p_bd->opened = TRUE;
+
+	return FFS_SUCCESS;
+}
+
+s32 bdev_close(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (!p_bd->opened)
+		return FFS_SUCCESS;
+
+	p_bd->opened = FALSE;
+	return FFS_SUCCESS;
+}
+
+s32 bdev_read(struct super_block *sb, u32 secno, struct buffer_head **bh, u32 num_secs, s32 read)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
+	if (*bh)
+		__brelse(*bh);
+
+	if (read)
+		*bh = __bread(sb->s_bdev, secno, num_secs << p_bd->sector_size_bits);
+	else
+		*bh = __getblk(sb->s_bdev, secno, num_secs << p_bd->sector_size_bits);
+
+	if (*bh)
+		return FFS_SUCCESS;
+
+	WARN(!p_fs->dev_ejected,
+		"[EXFAT] No bh, device seems wrong or to be ejected.\n");
+
+	return FFS_MEDIAERR;
+}
+
+s32 bdev_write(struct super_block *sb, u32 secno, struct buffer_head *bh, u32 num_secs, s32 sync)
+{
+	s32 count;
+	struct buffer_head *bh2;
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
+	if (secno == bh->b_blocknr) {
+		lock_buffer(bh);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		if (sync && (sync_dirty_buffer(bh) != 0))
+			return FFS_MEDIAERR;
+	} else {
+		count = num_secs << p_bd->sector_size_bits;
+
+		bh2 = __getblk(sb->s_bdev, secno, count);
+
+		if (bh2 == NULL)
+			goto no_bh;
+
+		lock_buffer(bh2);
+		memcpy(bh2->b_data, bh->b_data, count);
+		set_buffer_uptodate(bh2);
+		mark_buffer_dirty(bh2);
+		unlock_buffer(bh2);
+		if (sync && (sync_dirty_buffer(bh2) != 0)) {
+			__brelse(bh2);
+			goto no_bh;
+		}
+		__brelse(bh2);
+	}
+
+	return FFS_SUCCESS;
+
+no_bh:
+	WARN(!p_fs->dev_ejected,
+		"[EXFAT] No bh, device seems wrong or to be ejected.\n");
+
+	return FFS_MEDIAERR;
+}
+
+s32 bdev_sync(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
+	return sync_blockdev(sb->s_bdev);
+}
diff --git b/fs/exfat/exfat_blkdev.h b/fs/exfat/exfat_blkdev.h
new file mode 100644
index 0000000..169d25d
--- /dev/null
+++ b/fs/exfat/exfat_blkdev.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_blkdev.h                                            */
+/*  PURPOSE : Header File for exFAT Block Device Driver Glue Layer      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_BLKDEV_H
+#define _EXFAT_BLKDEV_H
+
+#include <linux/fs.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Non-Configurable)                     */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct __BD_INFO_T {
+	s32 sector_size;      /* in bytes */
+	s32 sector_size_bits;
+	s32 sector_size_mask;
+	s32 num_sectors;      /* total number of sectors in this block device */
+	bool  opened;           /* opened or not */
+} BD_INFO_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Variable Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32 bdev_init(void);
+s32 bdev_shutdown(void);
+s32 bdev_open(struct super_block *sb);
+s32 bdev_close(struct super_block *sb);
+s32 bdev_read(struct super_block *sb, u32 secno, struct buffer_head **bh, u32 num_secs, s32 read);
+s32 bdev_write(struct super_block *sb, u32 secno, struct buffer_head *bh, u32 num_secs, s32 sync);
+s32 bdev_sync(struct super_block *sb);
+
+#endif /* _EXFAT_BLKDEV_H */
diff --git b/fs/exfat/exfat_cache.c b/fs/exfat/exfat_cache.c
new file mode 100644
index 0000000..e6ca88b
--- /dev/null
+++ b/fs/exfat/exfat_cache.c
@@ -0,0 +1,780 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_cache.c                                             */
+/*  PURPOSE : exFAT Cache Manager                                       */
+/*            (FAT Cache & Buffer Cache)                                */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Sung-Kwan Kim] : first writing                        */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+
+#include "exfat_cache.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
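+/* sm_P()/sm_V() are compiled out to no-ops here; the f_sem/b_sem arguments
+ * only document which lock a critical section logically belongs to */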
+#define sm_P(s)
+#define sm_V(s)
+
+static s32 __FAT_read(struct super_block *sb, u32 loc, u32 *content);
+static s32 __FAT_write(struct super_block *sb, u32 loc, u32 content);
+
+static BUF_CACHE_T *FAT_cache_find(struct super_block *sb, u32 sec);
+static BUF_CACHE_T *FAT_cache_get(struct super_block *sb, u32 sec);
+static void FAT_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp);
+static void FAT_cache_remove_hash(BUF_CACHE_T *bp);
+
+static u8 *__buf_getblk(struct super_block *sb, u32 sec);
+
+static BUF_CACHE_T *buf_cache_find(struct super_block *sb, u32 sec);
+static BUF_CACHE_T *buf_cache_get(struct super_block *sb, u32 sec);
+static void buf_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp);
+static void buf_cache_remove_hash(BUF_CACHE_T *bp);
+
+static void push_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void push_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void move_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void move_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+
+/*======================================================================*/
+/*  Cache Initialization Functions                                      */
+/*======================================================================*/
+
+s32 buf_init(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	int i;
+
+	/* LRU list */
+	p_fs->FAT_cache_lru_list.next = p_fs->FAT_cache_lru_list.prev = &p_fs->FAT_cache_lru_list;
+
+	for (i = 0; i < FAT_CACHE_SIZE; i++) {
+		p_fs->FAT_cache_array[i].drv = -1;
+		p_fs->FAT_cache_array[i].sec = ~0;
+		p_fs->FAT_cache_array[i].flag = 0;
+		p_fs->FAT_cache_array[i].buf_bh = NULL;
+		p_fs->FAT_cache_array[i].prev = p_fs->FAT_cache_array[i].next = NULL;
+		push_to_mru(&(p_fs->FAT_cache_array[i]), &p_fs->FAT_cache_lru_list);
+	}
+
+	p_fs->buf_cache_lru_list.next = p_fs->buf_cache_lru_list.prev = &p_fs->buf_cache_lru_list;
+
+	for (i = 0; i < BUF_CACHE_SIZE; i++) {
+		p_fs->buf_cache_array[i].drv = -1;
+		p_fs->buf_cache_array[i].sec = ~0;
+		p_fs->buf_cache_array[i].flag = 0;
+		p_fs->buf_cache_array[i].buf_bh = NULL;
+		p_fs->buf_cache_array[i].prev = p_fs->buf_cache_array[i].next = NULL;
+		push_to_mru(&(p_fs->buf_cache_array[i]), &p_fs->buf_cache_lru_list);
+	}
+
+	/* HASH list */
+	for (i = 0; i < FAT_CACHE_HASH_SIZE; i++) {
+		p_fs->FAT_cache_hash_list[i].drv = -1;
+		p_fs->FAT_cache_hash_list[i].sec = ~0;
+		p_fs->FAT_cache_hash_list[i].hash_next = p_fs->FAT_cache_hash_list[i].hash_prev = &(p_fs->FAT_cache_hash_list[i]);
+	}
+
+	for (i = 0; i < FAT_CACHE_SIZE; i++)
+		FAT_cache_insert_hash(sb, &(p_fs->FAT_cache_array[i]));
+
+	for (i = 0; i < BUF_CACHE_HASH_SIZE; i++) {
+		p_fs->buf_cache_hash_list[i].drv = -1;
+		p_fs->buf_cache_hash_list[i].sec = ~0;
+		p_fs->buf_cache_hash_list[i].hash_next = p_fs->buf_cache_hash_list[i].hash_prev = &(p_fs->buf_cache_hash_list[i]);
+	}
+
+	for (i = 0; i < BUF_CACHE_SIZE; i++)
+		buf_cache_insert_hash(sb, &(p_fs->buf_cache_array[i]));
+
+	return FFS_SUCCESS;
+} /* end of buf_init */
+
+s32 buf_shutdown(struct super_block *sb)
+{
+	return FFS_SUCCESS;
+} /* end of buf_shutdown */
+
+/*======================================================================*/
+/*  FAT Read/Write Functions                                            */
+/*======================================================================*/
+
+/* in : sb, loc
+ * out: content
+ * returns 0 on success, -1 on error
+ */
+s32 FAT_read(struct super_block *sb, u32 loc, u32 *content)
+{
+	s32 ret;
+
+	sm_P(&f_sem);
+
+	ret = __FAT_read(sb, loc, content);
+
+	sm_V(&f_sem);
+
+	return ret;
+} /* end of FAT_read */
+
+s32 FAT_write(struct super_block *sb, u32 loc, u32 content)
+{
+	s32 ret;
+
+	sm_P(&f_sem);
+
+	ret = __FAT_write(sb, loc, content);
+
+	sm_V(&f_sem);
+
+	return ret;
+} /* end of FAT_write */
+
+static s32 __FAT_read(struct super_block *sb, u32 loc, u32 *content)
+{
+	s32 off;
+	u32 sec, _content;
+	u8 *fat_sector, *fat_entry;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_fs->vol_type == FAT12) {
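+		/* FAT12 packs two 12-bit entries into three bytes: entry 'loc'
+		 * starts at byte offset loc + loc/2 and may straddle a sector
+		 * boundary, which is handled separately below */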
+		sec = p_fs->FAT1_start_sector + ((loc + (loc >> 1)) >> p_bd->sector_size_bits);
+		off = (loc + (loc >> 1)) & p_bd->sector_size_mask;
+
+		if (off == (p_bd->sector_size-1)) {
+			fat_sector = FAT_getblk(sb, sec);
+			if (!fat_sector)
+				return -1;
+
+			_content  = (u32) fat_sector[off];
+
+			fat_sector = FAT_getblk(sb, ++sec);
+			if (!fat_sector)
+				return -1;
+
+			_content |= (u32) fat_sector[0] << 8;
+		} else {
+			fat_sector = FAT_getblk(sb, sec);
+			if (!fat_sector)
+				return -1;
+
+			fat_entry = &(fat_sector[off]);
+			_content = GET16(fat_entry);
+		}
+
+		if (loc & 1)
+			_content >>= 4;
+
+		_content &= 0x00000FFF;
+
+		if (_content >= CLUSTER_16(0x0FF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else if (p_fs->vol_type == FAT16) {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-1));
+		off = (loc << 1) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		_content = GET16_A(fat_entry);
+
+		_content &= 0x0000FFFF;
+
+		if (_content >= CLUSTER_16(0xFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else if (p_fs->vol_type == FAT32) {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		_content = GET32_A(fat_entry);
+
+		_content &= 0x0FFFFFFF;
+
+		if (_content >= CLUSTER_32(0x0FFFFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+		_content = GET32_A(fat_entry);
+
+		if (_content >= CLUSTER_32(0xFFFFFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	}
+
+	*content = CLUSTER_32(~0);
+	return 0;
+} /* end of __FAT_read */
+
+static s32 __FAT_write(struct super_block *sb, u32 loc, u32 content)
+{
+	s32 off;
+	u32 sec;
+	u8 *fat_sector, *fat_entry;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_fs->vol_type == FAT12) {
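+		/* a 12-bit entry shares the middle byte with its neighbour, so
+		 * the neighbour's nibble must be preserved when writing */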
+
+		content &= 0x00000FFF;
+
+		sec = p_fs->FAT1_start_sector + ((loc + (loc >> 1)) >> p_bd->sector_size_bits);
+		off = (loc + (loc >> 1)) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		if (loc & 1) { /* odd */
+
+			content <<= 4;
+
+			if (off == (p_bd->sector_size-1)) {
+				fat_sector[off] = (u8)(content | (fat_sector[off] & 0x0F));
+				FAT_modify(sb, sec);
+
+				fat_sector = FAT_getblk(sb, ++sec);
+				if (!fat_sector)
+					return -1;
+
+				fat_sector[0] = (u8)(content >> 8);
+			} else {
+				fat_entry = &(fat_sector[off]);
+				content |= GET16(fat_entry) & 0x000F;
+
+				SET16(fat_entry, content);
+			}
+		} else { /* even */
+			fat_sector[off] = (u8)(content);
+
+			if (off == (p_bd->sector_size-1)) {
+				fat_sector[off] = (u8)(content);
+				FAT_modify(sb, sec);
+
+				fat_sector = FAT_getblk(sb, ++sec);
+				fat_sector[0] = (u8)((fat_sector[0] & 0xF0) | (content >> 8));
+			} else {
+				fat_entry = &(fat_sector[off]);
+				content |= GET16(fat_entry) & 0xF000;
+
+				SET16(fat_entry, content);
+			}
+		}
+	}
+
+	else if (p_fs->vol_type == FAT16) {
+
+		content &= 0x0000FFFF;
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-1));
+		off = (loc << 1) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		SET16_A(fat_entry, content);
+	}
+
+	else if (p_fs->vol_type == FAT32) {
+
+		content &= 0x0FFFFFFF;
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		content |= GET32_A(fat_entry) & 0xF0000000;
+
+		SET32_A(fat_entry, content);
+	}
+
+	else { /* p_fs->vol_type == EXFAT */
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		SET32_A(fat_entry, content);
+	}
+
+	FAT_modify(sb, sec);
+	return 0;
+} /* end of __FAT_write */
+
+u8 *FAT_getblk(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	bp = FAT_cache_find(sb, sec);
+	if (bp != NULL) {
+		move_to_mru(bp, &p_fs->FAT_cache_lru_list);
+		return bp->buf_bh->b_data;
+	}
+
+	bp = FAT_cache_get(sb, sec);
+
+	FAT_cache_remove_hash(bp);
+
+	bp->drv = p_fs->drv;
+	bp->sec = sec;
+	bp->flag = 0;
+
+	FAT_cache_insert_hash(sb, bp);
+
+	if (sector_read(sb, sec, &(bp->buf_bh), 1) != FFS_SUCCESS) {
+		FAT_cache_remove_hash(bp);
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+		bp->buf_bh = NULL;
+
+		move_to_lru(bp, &p_fs->FAT_cache_lru_list);
+		return NULL;
+	}
+
+	return bp->buf_bh->b_data;
+} /* end of FAT_getblk */
+
+void FAT_modify(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	bp = FAT_cache_find(sb, sec);
+	if (bp != NULL)
+		sector_write(sb, sec, bp->buf_bh, 0);
+} /* end of FAT_modify */
+
+void FAT_release_all(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&f_sem);
+
+	bp = p_fs->FAT_cache_lru_list.next;
+	while (bp != &p_fs->FAT_cache_lru_list) {
+		if (bp->drv == p_fs->drv) {
+			bp->drv = -1;
+			bp->sec = ~0;
+			bp->flag = 0;
+
+			if (bp->buf_bh) {
+				__brelse(bp->buf_bh);
+				bp->buf_bh = NULL;
+			}
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&f_sem);
+} /* end of FAT_release_all */
+
+void FAT_sync(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&f_sem);
+
+	bp = p_fs->FAT_cache_lru_list.next;
+	while (bp != &p_fs->FAT_cache_lru_list) {
+		if ((bp->drv == p_fs->drv) && (bp->flag & DIRTYBIT)) {
+			sync_dirty_buffer(bp->buf_bh);
+			bp->flag &= ~(DIRTYBIT);
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&f_sem);
+} /* end of FAT_sync */
+
+static BUF_CACHE_T *FAT_cache_find(struct super_block *sb, u32 sec)
+{
+	s32 off;
+	BUF_CACHE_T *bp, *hp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
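+	/* hash the sector number, mixed with its cluster index, into a
+	 * power-of-two sized bucket table */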
+	off = (sec + (sec >> p_fs->sectors_per_clu_bits)) & (FAT_CACHE_HASH_SIZE - 1);
+
+	hp = &(p_fs->FAT_cache_hash_list[off]);
+	for (bp = hp->hash_next; bp != hp; bp = bp->hash_next) {
+		if ((bp->drv == p_fs->drv) && (bp->sec == sec)) {
+
+			WARN(!bp->buf_bh,
+			     "[EXFAT] FAT_cache entry has no bh; this will cause a kernel panic.\n");
+
+			touch_buffer(bp->buf_bh);
+			return bp;
+		}
+	}
+	return NULL;
+} /* end of FAT_cache_find */
+
+static BUF_CACHE_T *FAT_cache_get(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
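+	/* recycle the least-recently-used entry (the tail of the LRU list) */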
+	bp = p_fs->FAT_cache_lru_list.prev;
+
+	move_to_mru(bp, &p_fs->FAT_cache_lru_list);
+	return bp;
+} /* end of FAT_cache_get */
+
+static void FAT_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp)
+{
+	s32 off;
+	BUF_CACHE_T *hp;
+	FS_INFO_T *p_fs;
+
+	p_fs = &(EXFAT_SB(sb)->fs_info);
+	off = (bp->sec + (bp->sec >> p_fs->sectors_per_clu_bits)) & (FAT_CACHE_HASH_SIZE-1);
+
+	hp = &(p_fs->FAT_cache_hash_list[off]);
+	bp->hash_next = hp->hash_next;
+	bp->hash_prev = hp;
+	hp->hash_next->hash_prev = bp;
+	hp->hash_next = bp;
+} /* end of FAT_cache_insert_hash */
+
+static void FAT_cache_remove_hash(BUF_CACHE_T *bp)
+{
+	(bp->hash_prev)->hash_next = bp->hash_next;
+	(bp->hash_next)->hash_prev = bp->hash_prev;
+} /* end of FAT_cache_remove_hash */
+
+/*======================================================================*/
+/*  Buffer Read/Write Functions                                         */
+/*======================================================================*/
+
+u8 *buf_getblk(struct super_block *sb, u32 sec)
+{
+	u8 *buf;
+
+	sm_P(&b_sem);
+
+	buf = __buf_getblk(sb, sec);
+
+	sm_V(&b_sem);
+
+	return buf;
+} /* end of buf_getblk */
+
+static u8 *__buf_getblk(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	bp = buf_cache_find(sb, sec);
+	if (bp != NULL) {
+		move_to_mru(bp, &p_fs->buf_cache_lru_list);
+		return bp->buf_bh->b_data;
+	}
+
+	bp = buf_cache_get(sb, sec);
+
+	buf_cache_remove_hash(bp);
+
+	bp->drv = p_fs->drv;
+	bp->sec = sec;
+	bp->flag = 0;
+
+	buf_cache_insert_hash(sb, bp);
+
+	if (sector_read(sb, sec, &(bp->buf_bh), 1) != FFS_SUCCESS) {
+		buf_cache_remove_hash(bp);
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+		bp->buf_bh = NULL;
+
+		move_to_lru(bp, &p_fs->buf_cache_lru_list);
+		return NULL;
+	}
+
+	return bp->buf_bh->b_data;
+
+} /* end of __buf_getblk */
+
+void buf_modify(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		sector_write(sb, sec, bp->buf_bh, 0);
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_modify */
+
+void buf_lock(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		bp->flag |= LOCKBIT;
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_lock */
+
+void buf_unlock(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		bp->flag &= ~(LOCKBIT);
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_unlock */
+
+void buf_release(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL)) {
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+
+		if (bp->buf_bh) {
+			__brelse(bp->buf_bh);
+			bp->buf_bh = NULL;
+		}
+
+		move_to_lru(bp, &p_fs->buf_cache_lru_list);
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_release */
+
+void buf_release_all(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = p_fs->buf_cache_lru_list.next;
+	while (bp != &p_fs->buf_cache_lru_list) {
+		if (bp->drv == p_fs->drv) {
+			bp->drv = -1;
+			bp->sec = ~0;
+			bp->flag = 0;
+
+			if (bp->buf_bh) {
+				__brelse(bp->buf_bh);
+				bp->buf_bh = NULL;
+			}
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_release_all */
+
+void buf_sync(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = p_fs->buf_cache_lru_list.next;
+	while (bp != &p_fs->buf_cache_lru_list) {
+		if ((bp->drv == p_fs->drv) && (bp->flag & DIRTYBIT)) {
+			sync_dirty_buffer(bp->buf_bh);
+			bp->flag &= ~(DIRTYBIT);
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_sync */
+
+static BUF_CACHE_T *buf_cache_find(struct super_block *sb, u32 sec)
+{
+	s32 off;
+	BUF_CACHE_T *bp, *hp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	off = (sec + (sec >> p_fs->sectors_per_clu_bits)) & (BUF_CACHE_HASH_SIZE - 1);
+
+	hp = &(p_fs->buf_cache_hash_list[off]);
+	for (bp = hp->hash_next; bp != hp; bp = bp->hash_next) {
+		if ((bp->drv == p_fs->drv) && (bp->sec == sec)) {
+			touch_buffer(bp->buf_bh);
+			return bp;
+		}
+	}
+	return NULL;
+} /* end of buf_cache_find */
+
+static BUF_CACHE_T *buf_cache_get(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
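+	/* recycle the least-recently-used entry that is not locked */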
+	bp = p_fs->buf_cache_lru_list.prev;
+	while (bp->flag & LOCKBIT)
+		bp = bp->prev;
+
+	move_to_mru(bp, &p_fs->buf_cache_lru_list);
+	return bp;
+} /* end of buf_cache_get */
+
+static void buf_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp)
+{
+	s32 off;
+	BUF_CACHE_T *hp;
+	FS_INFO_T *p_fs;
+
+	p_fs = &(EXFAT_SB(sb)->fs_info);
+	off = (bp->sec + (bp->sec >> p_fs->sectors_per_clu_bits)) & (BUF_CACHE_HASH_SIZE-1);
+
+	hp = &(p_fs->buf_cache_hash_list[off]);
+	bp->hash_next = hp->hash_next;
+	bp->hash_prev = hp;
+	hp->hash_next->hash_prev = bp;
+	hp->hash_next = bp;
+} /* end of buf_cache_insert_hash */
+
+static void buf_cache_remove_hash(BUF_CACHE_T *bp)
+{
+	(bp->hash_prev)->hash_next = bp->hash_next;
+	(bp->hash_next)->hash_prev = bp->hash_prev;
+} /* end of buf_cache_remove_hash */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+static void push_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->next = list->next;
+	bp->prev = list;
+	list->next->prev = bp;
+	list->next = bp;
+} /* end of push_to_mru */
+
+static void push_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev = list->prev;
+	bp->next = list;
+	list->prev->next = bp;
+	list->prev = bp;
+} /* end of push_to_lru */
+
+static void move_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev->next = bp->next;
+	bp->next->prev = bp->prev;
+	push_to_mru(bp, list);
+} /* end of move_to_mru */
+
+static void move_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev->next = bp->next;
+	bp->next->prev = bp->prev;
+	push_to_lru(bp, list);
+} /* end of move_to_lru */
diff --git b/fs/exfat/exfat_cache.h b/fs/exfat/exfat_cache.h
new file mode 100644
index 0000000..d82aad5
--- /dev/null
+++ b/fs/exfat/exfat_cache.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_cache.h                                             */
+/*  PURPOSE : Header File for exFAT Cache Manager                       */
+/*            (FAT Cache & Buffer Cache)                                */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Sung-Kwan Kim] : first writing                        */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_CACHE_H
+#define _EXFAT_CACHE_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define LOCKBIT                 0x01
+#define DIRTYBIT                0x02
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct __BUF_CACHE_T {
+	struct __BUF_CACHE_T *next;
+	struct __BUF_CACHE_T *prev;
+	struct __BUF_CACHE_T *hash_next;
+	struct __BUF_CACHE_T *hash_prev;
+	s32                  drv;
+	u32                  sec;
+	u32                  flag;
+	struct buffer_head   *buf_bh;
+} BUF_CACHE_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32  buf_init(struct super_block *sb);
+s32  buf_shutdown(struct super_block *sb);
+s32  FAT_read(struct super_block *sb, u32 loc, u32 *content);
+s32  FAT_write(struct super_block *sb, u32 loc, u32 content);
+u8 *FAT_getblk(struct super_block *sb, u32 sec);
+void   FAT_modify(struct super_block *sb, u32 sec);
+void   FAT_release_all(struct super_block *sb);
+void   FAT_sync(struct super_block *sb);
+u8 *buf_getblk(struct super_block *sb, u32 sec);
+void   buf_modify(struct super_block *sb, u32 sec);
+void   buf_lock(struct super_block *sb, u32 sec);
+void   buf_unlock(struct super_block *sb, u32 sec);
+void   buf_release(struct super_block *sb, u32 sec);
+void   buf_release_all(struct super_block *sb);
+void   buf_sync(struct super_block *sb);
+
+#endif /* _EXFAT_CACHE_H */
diff --git b/fs/exfat/exfat_config.h b/fs/exfat/exfat_config.h
new file mode 100644
index 0000000..33c6525
--- /dev/null
+++ b/fs/exfat/exfat_config.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_config.h                                            */
+/*  PURPOSE : Header File for exFAT Configurable Policies               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_CONFIG_H
+#define _EXFAT_CONFIG_H
+
+/*======================================================================*/
+/*                                                                      */
+/*                        FFS CONFIGURATIONS                            */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/* Feature Config                                                       */
+/*----------------------------------------------------------------------*/
+#ifndef CONFIG_EXFAT_DISCARD
+#define CONFIG_EXFAT_DISCARD		1	/* mount option -o discard support */
+#endif
+
+#ifndef CONFIG_EXFAT_DELAYED_SYNC
+#define CONFIG_EXFAT_DELAYED_SYNC 0
+#endif
+
+#ifndef CONFIG_EXFAT_KERNEL_DEBUG
+#define CONFIG_EXFAT_KERNEL_DEBUG	1	/* kernel debug features via ioctl */
+#endif
+
+#ifndef CONFIG_EXFAT_DEBUG_MSG
+#define CONFIG_EXFAT_DEBUG_MSG		0	/* debugging message on/off */
+#endif
+
+#ifndef CONFIG_EXFAT_DEFAULT_CODEPAGE
+#define CONFIG_EXFAT_DEFAULT_CODEPAGE	437
+#define CONFIG_EXFAT_DEFAULT_IOCHARSET	"utf8"
+#endif
+
+#endif /* _EXFAT_CONFIG_H */
diff --git b/fs/exfat/exfat_core.c b/fs/exfat/exfat_core.c
new file mode 100644
index 0000000..d485141
--- /dev/null
+++ b/fs/exfat/exfat_core.c
@@ -0,0 +1,5114 @@
+/* Some of the source code in this file came from "linux/fs/fat/misc.c".  */
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_core.c                                              */
+/*  PURPOSE : exFAT File Manager                                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/version.h>
+#include <linux/param.h>
+#include <linux/log2.h>
+
+#include "exfat_bitmap.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+#include <linux/blkdev.h>
+
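+/* mark the superblock dirty: kernels before 3.7 track this in sb->s_dirt,
+ * later kernels in the exFAT private superblock info */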
+static void __set_sb_dirty(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	sb->s_dirt = 1;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	sbi->s_dirt = 1;
+#endif
+}
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+extern u8 uni_upcase[];
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+static u8 name_buf[MAX_PATH_LENGTH * MAX_CHARSET_SIZE];
+
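+/* DOS reserved device names, space-padded to the 8-character base-name width */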
+static char *reserved_names[] = {
+	"AUX     ", "CON     ", "NUL     ", "PRN     ",
+	"COM1    ", "COM2    ", "COM3    ", "COM4    ",
+	"COM5    ", "COM6    ", "COM7    ", "COM8    ", "COM9    ",
+	"LPT1    ", "LPT2    ", "LPT3    ", "LPT4    ",
+	"LPT5    ", "LPT6    ", "LPT7    ", "LPT8    ", "LPT9    ",
+	NULL
+};
+
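+/* free_bit[i] : index of the lowest zero bit in i (defined for 0 <= i <= 254) */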
+static u8 free_bit[] = {
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, /*   0 ~  19 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, /*  20 ~  39 */
+	0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /*  40 ~  59 */
+	0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, /*  60 ~  79 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, /*  80 ~  99 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, /* 100 ~ 119 */
+	0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /* 120 ~ 139 */
+	0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, /* 140 ~ 159 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, /* 160 ~ 179 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, /* 180 ~ 199 */
+	0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /* 200 ~ 219 */
+	0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, /* 220 ~ 239 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0                 /* 240 ~ 254 */
+};
+
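+/* used_bit[i] : number of set bits (population count) in i */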
+static u8 used_bit[] = {
+	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, /*   0 ~  19 */
+	2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, /*  20 ~  39 */
+	2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, /*  40 ~  59 */
+	4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /*  60 ~  79 */
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, /*  80 ~  99 */
+	3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, /* 100 ~ 119 */
+	4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, /* 120 ~ 139 */
+	3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 140 ~ 159 */
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, /* 160 ~ 179 */
+	4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, /* 180 ~ 199 */
+	3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, /* 200 ~ 219 */
+	5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 220 ~ 239 */
+	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8              /* 240 ~ 255 */
+};
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*======================================================================*/
+
+/* ffsInit : initialize the block device and file system layers */
+s32 ffsInit(void)
+{
+	s32 ret;
+
+	ret = bdev_init();
+	if (ret)
+		return ret;
+
+	ret = fs_init();
+	if (ret)
+		return ret;
+
+	return FFS_SUCCESS;
+} /* end of ffsInit */
+
+/* ffsShutdown : free all memory-allocated global buffers */
+s32 ffsShutdown(void)
+{
+	s32 ret;
+	ret = fs_shutdown();
+	if (ret)
+		return ret;
+
+	ret = bdev_shutdown();
+	if (ret)
+		return ret;
+
+	return FFS_SUCCESS;
+} /* end of ffsShutdown */
+
+/* ffsMountVol : mount the file system volume */
+s32 ffsMountVol(struct super_block *sb)
+{
+	int i, ret;
+	PBR_SECTOR_T *p_pbr;
+	struct buffer_head *tmp_bh = NULL;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	printk("[EXFAT] trying to mount...\n");
+
+	sm_init(&p_fs->v_sem);
+	p_fs->dev_ejected = FALSE;
+
+	/* open the block device */
+	if (bdev_open(sb))
+		return FFS_MEDIAERR;
+
+	if (p_bd->sector_size < sb->s_blocksize)
+		return FFS_MEDIAERR;
+	if (p_bd->sector_size > sb->s_blocksize)
+		sb_set_blocksize(sb, p_bd->sector_size);
+
+	/* read Sector 0 */
+	if (sector_read(sb, 0, &tmp_bh, 1) != FFS_SUCCESS)
+		return FFS_MEDIAERR;
+
+	p_fs->PBR_sector = 0;
+
+	p_pbr = (PBR_SECTOR_T *) tmp_bh->b_data;
+
+	/* check the validity of PBR */
+	if (GET16_A(p_pbr->signature) != PBR_SIGNATURE) {
+		brelse(tmp_bh);
+		bdev_close(sb);
+		return FFS_FORMATERR;
+	}
+
+	/* fill fs_struct */
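+	/* exFAT keeps the 53-byte BPB area of the boot sector zeroed, so any
+	 * non-zero byte in it indicates a FAT12/16/32 volume instead */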
+	for (i = 0; i < 53; i++)
+		if (p_pbr->bpb[i])
+			break;
+
+	if (i < 53) {
+		if (GET16(p_pbr->bpb+11)) /* num_fat_sectors */
+			ret = fat16_mount(sb, p_pbr);
+		else
+			ret = fat32_mount(sb, p_pbr);
+	} else {
+		ret = exfat_mount(sb, p_pbr);
+	}
+
+	brelse(tmp_bh);
+
+	if (ret) {
+		bdev_close(sb);
+		return ret;
+	}
+
+	if (p_fs->vol_type == EXFAT) {
+		ret = load_alloc_bitmap(sb);
+		if (ret) {
+			bdev_close(sb);
+			return ret;
+		}
+		ret = load_upcase_table(sb);
+		if (ret) {
+			free_alloc_bitmap(sb);
+			bdev_close(sb);
+			return ret;
+		}
+	}
+
+	if (p_fs->dev_ejected) {
+		if (p_fs->vol_type == EXFAT) {
+			free_upcase_table(sb);
+			free_alloc_bitmap(sb);
+		}
+		bdev_close(sb);
+		return FFS_MEDIAERR;
+	}
+
+	printk("[EXFAT] mounted successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsMountVol */
+
+/* ffsUmountVol : umount the file system volume */
+s32 ffsUmountVol(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	printk("[EXFAT] trying to unmount...\n");
+
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+
+	if (p_fs->vol_type == EXFAT) {
+		free_upcase_table(sb);
+		free_alloc_bitmap(sb);
+	}
+
+	FAT_release_all(sb);
+	buf_release_all(sb);
+
+	/* close the block device */
+	bdev_close(sb);
+
+	if (p_fs->dev_ejected) {
+		printk("[EXFAT] unmounted with media errors. "
+			"device is already ejected.\n");
+		return FFS_MEDIAERR;
+	}
+
+	printk("[EXFAT] unmounted successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsUmountVol */
+
+/* ffsGetVolInfo : get the information of a file system volume */
+s32 ffsGetVolInfo(struct super_block *sb, VOL_INFO_T *info)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->used_clusters == (u32) ~0)
+		p_fs->used_clusters = p_fs->fs_func->count_used_clusters(sb);
+
+	info->FatType = p_fs->vol_type;
+	info->ClusterSize = p_fs->cluster_size;
+	info->NumClusters = p_fs->num_clusters - 2; /* clu 0 & 1 */
+	info->UsedClusters = p_fs->used_clusters;
+	info->FreeClusters = info->NumClusters - info->UsedClusters;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsGetVolInfo */
+
+/* ffsSyncVol : synchronize a file system volume */
+s32 ffsSyncVol(struct super_block *sb, s32 do_sync)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* synchronize the file system */
+	fs_sync(sb, do_sync);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSyncVol */
+
+/*----------------------------------------------------------------------*/
+/*  File Operation Functions                                            */
+/*----------------------------------------------------------------------*/
+
+/* ffsLookupFile : lookup a file */
+s32 ffsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	DOS_NAME_T dos_name;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	DPRINTK("ffsLookupFile entered\n");
+
+	/* check the validity of directory name in the given pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	ret = get_num_entries_and_dos_name(sb, &dir, &uni_name, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* search the file name for directories */
+	dentry = p_fs->fs_func->find_dir_entry(sb, &dir, &uni_name, num_entries, &dos_name, TYPE_ALL);
+	if (dentry < -1)
+		return FFS_NOTFOUND;
+
+	fid->dir.dir = dir.dir;
+	fid->dir.size = dir.size;
+	fid->dir.flags = dir.flags;
+	fid->entry = dentry;
+
+	if (dentry == -1) {
+		fid->type = TYPE_DIR;
+		fid->rwoffset = 0;
+		fid->hint_last_off = -1;
+
+		fid->attr = ATTR_SUBDIR;
+		fid->flags = 0x01;
+		fid->size = 0;
+		fid->start_clu = p_fs->root_dir;
+	} else {
+		if (p_fs->vol_type == EXFAT) {
+			es = get_entry_set_in_dir(sb, &dir, dentry, ES_2_ENTRIES, &ep);
+			if (!es)
+				return FFS_MEDIAERR;
+			ep2 = ep+1;
+		} else {
+			ep = get_entry_in_dir(sb, &dir, dentry, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+			ep2 = ep;
+		}
+
+		fid->type = p_fs->fs_func->get_entry_type(ep);
+		fid->rwoffset = 0;
+		fid->hint_last_off = -1;
+		fid->attr = p_fs->fs_func->get_entry_attr(ep);
+
+		fid->size = p_fs->fs_func->get_entry_size(ep2);
+		if ((fid->type == TYPE_FILE) && (fid->size == 0)) {
+			fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+			fid->start_clu = CLUSTER_32(~0);
+		} else {
+			fid->flags = p_fs->fs_func->get_entry_flag(ep2);
+			fid->start_clu = p_fs->fs_func->get_entry_clu0(ep2);
+		}
+
+		if (p_fs->vol_type == EXFAT)
+			release_entry_set(es);
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	DPRINTK("ffsLookupFile exited successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsLookupFile */
+
+/* ffsCreateFile : create a file */
+s32 ffsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid)
+{
+	s32 ret/*, dentry*/;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of directory name in the given pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* create a new file */
+	ret = create_file(inode, &dir, &uni_name, mode, fid);
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsCreateFile */
+
+/* ffsReadFile : read data from an open file */
+s32 ffsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount)
+{
+	s32 offset, sec_offset, clu_offset;
+	u32 clu, LogSector;
+	u64 oneblkread, read_bytes;
+	struct buffer_head *tmp_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+	if (count > (fid->size - fid->rwoffset))
+		count = fid->size - fid->rwoffset;
+
+	if (count == 0) {
+		if (rcount != NULL)
+			*rcount = 0;
+		return FFS_EOF;
+	}
+
+	read_bytes = 0;
+
+	while (count > 0) {
+		clu_offset = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		clu = fid->start_clu;
+
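+		/* flags == 0x03 means "no FAT chain": the clusters are
+		 * contiguous and can be addressed by arithmetic alone;
+		 * otherwise the FAT chain has to be walked */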
+		if (fid->flags == 0x03) {
+			clu += clu_offset;
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu = fid->hint_last_clu;
+			}
+
+			while (clu_offset > 0) {
+				/* clu = FAT_read(sb, clu); */
+				if (FAT_read(sb, clu, &clu) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+
+		/* hint information */
+		fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		fid->hint_last_clu = clu;
+
+		offset = (s32)(fid->rwoffset & (p_fs->cluster_size-1)); /* byte offset in cluster   */
+		sec_offset = offset >> p_bd->sector_size_bits;            /* sector offset in cluster */
+		offset &= p_bd->sector_size_mask;                         /* byte offset in sector    */
+
+		LogSector = START_SECTOR(clu) + sec_offset;
+
+		oneblkread = (u64)(p_bd->sector_size - offset);
+		if (oneblkread > count)
+			oneblkread = count;
+
+		if ((offset == 0) && (oneblkread == p_bd->sector_size)) {
+			if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) buffer)+read_bytes, ((char *) tmp_bh->b_data), (s32) oneblkread);
+		} else {
+			if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) buffer)+read_bytes, ((char *) tmp_bh->b_data)+offset, (s32) oneblkread);
+		}
+		count -= oneblkread;
+		read_bytes += oneblkread;
+		fid->rwoffset += oneblkread;
+	}
+	brelse(tmp_bh);
+
+err_out:
+	/* set the size of read bytes */
+	if (rcount != NULL)
+		*rcount = read_bytes;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsReadFile */
+
+/* ffsWriteFile : write data into an open file */
+s32 ffsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount)
+{
+	s32 modified = FALSE, offset, sec_offset, clu_offset;
+	s32 num_clusters, num_alloc, num_alloced = (s32) ~0;
+	u32 clu, last_clu, LogSector, sector = 0;
+	u64 oneblkwrite, write_bytes;
+	CHAIN_T new_clu;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct buffer_head *tmp_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+	if (count == 0) {
+		if (wcount != NULL)
+			*wcount = 0;
+		return FFS_SUCCESS;
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	if (fid->size == 0)
+		num_clusters = 0;
+	else
+		num_clusters = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+
+	write_bytes = 0;
+
+	while (count > 0) {
+		clu_offset = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		clu = last_clu = fid->start_clu;
+
+		if (fid->flags == 0x03) {
+			if ((clu_offset > 0) && (clu != CLUSTER_32(~0))) {
+				last_clu += clu_offset - 1;
+
+				if (clu_offset == num_clusters)
+					clu = CLUSTER_32(~0);
+				else
+					clu += clu_offset;
+			}
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu = fid->hint_last_clu;
+			}
+
+			while ((clu_offset > 0) && (clu != CLUSTER_32(~0))) {
+				last_clu = clu;
+				/* clu = FAT_read(sb, clu); */
+				if (FAT_read(sb, clu, &clu) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+
+		if (clu == CLUSTER_32(~0)) {
+			num_alloc = (s32)((count-1) >> p_fs->cluster_size_bits) + 1;
+			new_clu.dir = (last_clu == CLUSTER_32(~0)) ? CLUSTER_32(~0) : last_clu+1;
+			new_clu.size = 0;
+			new_clu.flags = fid->flags;
+
+			/* (1) allocate a chain of clusters */
+			num_alloced = p_fs->fs_func->alloc_cluster(sb, num_alloc, &new_clu);
+			if (num_alloced == 0)
+				break;
+			else if (num_alloced < 0)
+				return FFS_MEDIAERR;
+
+			/* (2) append to the FAT chain */
+			if (last_clu == CLUSTER_32(~0)) {
+				if (new_clu.flags == 0x01)
+					fid->flags = 0x01;
+				fid->start_clu = new_clu.dir;
+				modified = TRUE;
+			} else {
+				if (new_clu.flags != fid->flags) {
+					exfat_chain_cont_cluster(sb, fid->start_clu, num_clusters);
+					fid->flags = 0x01;
+					modified = TRUE;
+				}
+				if (new_clu.flags == 0x01)
+					FAT_write(sb, last_clu, new_clu.dir);
+			}
+
+			num_clusters += num_alloced;
+			clu = new_clu.dir;
+		}
+
+		/* hint information */
+		fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		fid->hint_last_clu = clu;
+
+		offset = (s32)(fid->rwoffset & (p_fs->cluster_size-1)); /* byte offset in cluster   */
+		sec_offset = offset >> p_bd->sector_size_bits;            /* sector offset in cluster */
+		offset &= p_bd->sector_size_mask;                         /* byte offset in sector    */
+
+		LogSector = START_SECTOR(clu) + sec_offset;
+
+		oneblkwrite = (u64)(p_bd->sector_size - offset);
+		if (oneblkwrite > count)
+			oneblkwrite = count;
+
+		if ((offset == 0) && (oneblkwrite == p_bd->sector_size)) {
+			if (sector_read(sb, LogSector, &tmp_bh, 0) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) tmp_bh->b_data), ((char *) buffer)+write_bytes, (s32) oneblkwrite);
+			if (sector_write(sb, LogSector, tmp_bh, 0) != FFS_SUCCESS) {
+				brelse(tmp_bh);
+				goto err_out;
+			}
+		} else {
+			if ((offset > 0) || ((fid->rwoffset+oneblkwrite) < fid->size)) {
+				if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+					goto err_out;
+			} else {
+				if (sector_read(sb, LogSector, &tmp_bh, 0) != FFS_SUCCESS)
+					goto err_out;
+			}
+
+			memcpy(((char *) tmp_bh->b_data)+offset, ((char *) buffer)+write_bytes, (s32) oneblkwrite);
+			if (sector_write(sb, LogSector, tmp_bh, 0) != FFS_SUCCESS) {
+				brelse(tmp_bh);
+				goto err_out;
+			}
+		}
+
+		count -= oneblkwrite;
+		write_bytes += oneblkwrite;
+		fid->rwoffset += oneblkwrite;
+
+		fid->attr |= ATTR_ARCHIVE;
+
+		if (fid->size < fid->rwoffset) {
+			fid->size = fid->rwoffset;
+			modified = TRUE;
+		}
+	}
+
+	brelse(tmp_bh);
+
+	/* (3) update the directory entry */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			goto err_out;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			goto err_out;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_time(ep, tm_current(&tm), TM_MODIFY);
+	p_fs->fs_func->set_entry_attr(ep, fid->attr);
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+
+	if (modified) {
+		if (p_fs->fs_func->get_entry_flag(ep2) != fid->flags)
+			p_fs->fs_func->set_entry_flag(ep2, fid->flags);
+
+		if (p_fs->fs_func->get_entry_size(ep2) != fid->size)
+			p_fs->fs_func->set_entry_size(ep2, fid->size);
+
+		if (p_fs->fs_func->get_entry_clu0(ep2) != fid->start_clu)
+			p_fs->fs_func->set_entry_clu0(ep2, fid->start_clu);
+
+		if (p_fs->vol_type != EXFAT)
+			buf_modify(sb, sector);
+	}
+
+	if (p_fs->vol_type == EXFAT) {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+err_out:
+	/* set the size of written bytes */
+	if (wcount != NULL)
+		*wcount = write_bytes;
+
+	if (num_alloced == 0)
+		return FFS_FULL;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsWriteFile */
+
+/* ffsTruncateFile : shrink the file to the given size */
+s32 ffsTruncateFile(struct inode *inode, u64 old_size, u64 new_size)
+{
+	s32 num_clusters;
+	u32 last_clu = CLUSTER_32(0), sector = 0;
+	CHAIN_T clu;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	ENTRY_SET_CACHE_T *es = NULL;
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->size != old_size) {
+		printk(KERN_ERR "[EXFAT] truncate : can't skip it because of "
+				"size-mismatch(old:%llu->fid:%llu).\n",
+				old_size, fid->size);
+	}
+
+	if (old_size <= new_size)
+		return FFS_SUCCESS;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	clu.dir = fid->start_clu;
+	clu.size = (s32)((old_size-1) >> p_fs->cluster_size_bits) + 1;
+	clu.flags = fid->flags;
+
+	if (new_size > 0) {
+		num_clusters = (s32)((new_size-1) >> p_fs->cluster_size_bits) + 1;
+
+		if (clu.flags == 0x03) {
+			clu.dir += num_clusters;
+		} else {
+			while (num_clusters > 0) {
+				last_clu = clu.dir;
+				if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+					return FFS_MEDIAERR;
+				num_clusters--;
+			}
+		}
+
+		clu.size -= num_clusters;
+	}
+
+	fid->size = new_size;
+	fid->attr |= ATTR_ARCHIVE;
+	if (new_size == 0) {
+		fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+		fid->start_clu = CLUSTER_32(~0);
+	}
+
+	/* (1) update the directory entry */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_time(ep, tm_current(&tm), TM_MODIFY);
+	p_fs->fs_func->set_entry_attr(ep, fid->attr);
+
+	p_fs->fs_func->set_entry_size(ep2, new_size);
+	if (new_size == 0) {
+		p_fs->fs_func->set_entry_flag(ep2, 0x01);
+		p_fs->fs_func->set_entry_clu0(ep2, CLUSTER_32(0));
+	}
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+	else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+	/* (2) cut off from the FAT chain */
+	if (last_clu != CLUSTER_32(0)) {
+		if (fid->flags == 0x01)
+			FAT_write(sb, last_clu, CLUSTER_32(~0));
+	}
+
+	/* (3) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu, 0);
+
+	/* hint information */
+	fid->hint_last_off = -1;
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsTruncateFile */
+
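+/* refresh the cached parent-directory chain in 'fid' when it no longer
+ * matches the parent inode (e.g. the parent has been moved or resized) */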
+static void update_parent_info(FILE_ID_T *fid, struct inode *parent_inode)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(parent_inode->i_sb)->fs_info);
+	FILE_ID_T *parent_fid = &(EXFAT_I(parent_inode)->fid);
+
+	if (unlikely((parent_fid->flags != fid->dir.flags)
+		|| (parent_fid->size != (fid->dir.size<<p_fs->cluster_size_bits))
+		|| (parent_fid->start_clu != fid->dir.dir))) {
+
+		fid->dir.dir = parent_fid->start_clu;
+		fid->dir.flags = parent_fid->flags;
+		fid->dir.size = ((parent_fid->size + (p_fs->cluster_size-1))
+						>> p_fs->cluster_size_bits);
+	}
+}
+
+/* ffsMoveFile : move (rename) an old file into a new file */
+s32 ffsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry)
+{
+	s32 ret;
+	s32 dentry;
+	CHAIN_T olddir, newdir;
+	CHAIN_T *p_dir = NULL;
+	UNI_NAME_T uni_name;
+	DENTRY_T *ep;
+	struct super_block *sb = old_parent_inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	u8 *new_path = (u8 *) new_dentry->d_name.name;
+	struct inode *new_inode = new_dentry->d_inode;
+	int num_entries;
+	FILE_ID_T *new_fid = NULL;
+	s32 new_entry = 0;
+
+	/* check the validity of pointer parameters */
+	if ((new_path == NULL) || (*new_path == '\0'))
+		return FFS_ERROR;
+
+	update_parent_info(fid, old_parent_inode);
+
+	olddir.dir = fid->dir.dir;
+	olddir.size = fid->dir.size;
+	olddir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	/* check if the old file is "." or ".." */
+	if (p_fs->vol_type != EXFAT) {
+		if ((olddir.dir != p_fs->root_dir) && (dentry < 2))
+			return FFS_PERMISSIONERR;
+	}
+
+	ep = get_entry_in_dir(sb, &olddir, dentry, NULL);
+	if (!ep)
+		return FFS_MEDIAERR;
+
+	if (p_fs->fs_func->get_entry_attr(ep) & ATTR_READONLY)
+		return FFS_PERMISSIONERR;
+
+	/* if the target entry already exists and is a directory, it must be empty */
+	if (new_inode) {
+		u32 entry_type;
+
+		ret = FFS_MEDIAERR;
+		new_fid = &EXFAT_I(new_inode)->fid;
+
+		update_parent_info(new_fid, new_parent_inode);
+
+		p_dir = &(new_fid->dir);
+		new_entry = new_fid->entry;
+		ep = get_entry_in_dir(sb, p_dir, new_entry, NULL);
+		if (!ep)
+			goto out;
+
+		entry_type = p_fs->fs_func->get_entry_type(ep);
+
+		if (entry_type == TYPE_DIR) {
+			CHAIN_T new_clu;
+			new_clu.dir = new_fid->start_clu;
+			new_clu.size = (s32)((new_fid->size-1) >> p_fs->cluster_size_bits) + 1;
+			new_clu.flags = new_fid->flags;
+
+			if (!is_dir_empty(sb, &new_clu))
+				return FFS_FILEEXIST;
+		}
+	}
+
+	/* check the validity of directory name in the given new pathname */
+	ret = resolve_path(new_parent_inode, new_path, &newdir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	if (olddir.dir == newdir.dir)
+		ret = rename_file(new_parent_inode, &olddir, dentry, &uni_name, fid);
+	else
+		ret = move_file(new_parent_inode, &olddir, dentry, &newdir, &uni_name, fid);
+
+	if ((ret == FFS_SUCCESS) && new_inode) {
+		/* delete entries of new_dir */
+		ep = get_entry_in_dir(sb, p_dir, new_entry, NULL);
+		if (!ep)
+			goto out;
+
+		num_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, new_entry, ep);
+		if (num_entries < 0)
+			goto out;
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, new_entry, 0, num_entries+1);
+	}
+out:
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsMoveFile */
+
+/* ffsRemoveFile : remove a file */
+s32 ffsRemoveFile(struct inode *inode, FILE_ID_T *fid)
+{
+	s32 dentry;
+	CHAIN_T dir, clu_to_free;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	dir.dir = fid->dir.dir;
+	dir.size = fid->dir.size;
+	dir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	ep = get_entry_in_dir(sb, &dir, dentry, NULL);
+	if (!ep)
+		return FFS_MEDIAERR;
+
+	if (p_fs->fs_func->get_entry_attr(ep) & ATTR_READONLY)
+		return FFS_PERMISSIONERR;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* (1) update the directory entry */
+	remove_file(inode, &dir, dentry);
+
+	clu_to_free.dir = fid->start_clu;
+	clu_to_free.size = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+	clu_to_free.flags = fid->flags;
+
+	/* (2) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu_to_free, 0);
+
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsRemoveFile */
+
+/* ffsSetAttr : set the attribute of a given file */
+s32 ffsSetAttr(struct inode *inode, u32 attr)
+{
+	u32 type, sector = 0;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+	ENTRY_SET_CACHE_T *es = NULL;
+
+	if (fid->attr == attr) {
+		if (p_fs->dev_ejected)
+			return FFS_MEDIAERR;
+		return FFS_SUCCESS;
+	}
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	/* get the directory entry of given file */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+	}
+
+	type = p_fs->fs_func->get_entry_type(ep);
+
+	if (((type == TYPE_FILE) && (attr & ATTR_SUBDIR)) ||
+		((type == TYPE_DIR) && (!(attr & ATTR_SUBDIR)))) {
+		s32 err;
+		if (p_fs->dev_ejected)
+			err = FFS_MEDIAERR;
+		else
+			err = FFS_ERROR;
+
+		if (p_fs->vol_type == EXFAT)
+			release_entry_set(es);
+		return err;
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* set the file attribute */
+	fid->attr = attr;
+	p_fs->fs_func->set_entry_attr(ep, attr);
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+	else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSetAttr */
+
+/* ffsGetStat : get the information of a given file */
+s32 ffsGetStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	u32 sector = 0;
+	s32 count;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	ENTRY_SET_CACHE_T *es = NULL;
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+
+	DPRINTK("ffsGetStat entered\n");
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			info->Attr = ATTR_SUBDIR;
+			memset((char *) &info->CreateTimestamp, 0, sizeof(DATE_TIME_T));
+			memset((char *) &info->ModifyTimestamp, 0, sizeof(DATE_TIME_T));
+			memset((char *) &info->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+			strcpy(info->ShortName, ".");
+			strcpy(info->Name, ".");
+
+			dir.dir = p_fs->root_dir;
+			dir.flags = 0x01;
+
+			if (p_fs->root_dir == CLUSTER_32(0)) /* FAT16 root_dir */
+				info->Size = p_fs->dentries_in_root << DENTRY_SIZE_BITS;
+			else
+				info->Size = count_num_clusters(sb, &dir) << p_fs->cluster_size_bits;
+
+			count = count_dos_name_entries(sb, &dir, TYPE_DIR);
+			if (count < 0)
+				return FFS_MEDIAERR;
+			info->NumSubdirs = count;
+
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	/* get the directory entry of given file or directory */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_2_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+		buf_lock(sb, sector);
+	}
+
+	/* set FILE_INFO structure using the acquired DENTRY_T */
+	info->Attr = p_fs->fs_func->get_entry_attr(ep);
+
+	p_fs->fs_func->get_entry_time(ep, &tm, TM_CREATE);
+	info->CreateTimestamp.Year = tm.year;
+	info->CreateTimestamp.Month = tm.mon;
+	info->CreateTimestamp.Day = tm.day;
+	info->CreateTimestamp.Hour = tm.hour;
+	info->CreateTimestamp.Minute = tm.min;
+	info->CreateTimestamp.Second = tm.sec;
+	info->CreateTimestamp.MilliSecond = 0;
+
+	p_fs->fs_func->get_entry_time(ep, &tm, TM_MODIFY);
+	info->ModifyTimestamp.Year = tm.year;
+	info->ModifyTimestamp.Month = tm.mon;
+	info->ModifyTimestamp.Day = tm.day;
+	info->ModifyTimestamp.Hour = tm.hour;
+	info->ModifyTimestamp.Minute = tm.min;
+	info->ModifyTimestamp.Second = tm.sec;
+	info->ModifyTimestamp.MilliSecond = 0;
+
+	memset((char *) &info->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+
+	*(uni_name.name) = 0x0;
+	/* XXX: this is inefficient for exfat because the name is already
+	 * contained in es; the API should be revised */
+	p_fs->fs_func->get_uni_name_from_ext_entry(sb, &(fid->dir), fid->entry, uni_name.name);
+	if (*(uni_name.name) == 0x0 && p_fs->vol_type != EXFAT)
+		get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x1);
+	nls_uniname_to_cstring(sb, info->Name, &uni_name);
+
+	if (p_fs->vol_type == EXFAT) {
+		info->NumSubdirs = 2;
+	} else {
+		buf_unlock(sb, sector);
+		get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x0);
+		nls_uniname_to_cstring(sb, info->ShortName, &uni_name);
+		info->NumSubdirs = 0;
+	}
+
+	info->Size = p_fs->fs_func->get_entry_size(ep2);
+
+	if (p_fs->vol_type == EXFAT)
+		release_entry_set(es);
+
+	if (is_dir) {
+		dir.dir = fid->start_clu;
+		dir.flags = 0x01;
+
+		if (info->Size == 0)
+			info->Size = (u64) count_num_clusters(sb, &dir) << p_fs->cluster_size_bits;
+
+		count = count_dos_name_entries(sb, &dir, TYPE_DIR);
+		if (count < 0)
+			return FFS_MEDIAERR;
+		info->NumSubdirs += count;
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	DPRINTK("ffsGetStat exited successfully\n");
+	return FFS_SUCCESS;
+} /* end of ffsGetStat */
+
+/* ffsSetStat : set the information of a given file */
+s32 ffsSetStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	u32 sector = 0;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* get the directory entry of given file or directory */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		/* for volumes other than exfat */
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_attr(ep, info->Attr);
+
+	/* set FILE_INFO structure using the acquired DENTRY_T */
+	tm.sec  = info->CreateTimestamp.Second;
+	tm.min  = info->CreateTimestamp.Minute;
+	tm.hour = info->CreateTimestamp.Hour;
+	tm.day  = info->CreateTimestamp.Day;
+	tm.mon  = info->CreateTimestamp.Month;
+	tm.year = info->CreateTimestamp.Year;
+	p_fs->fs_func->set_entry_time(ep, &tm, TM_CREATE);
+
+	tm.sec  = info->ModifyTimestamp.Second;
+	tm.min  = info->ModifyTimestamp.Minute;
+	tm.hour = info->ModifyTimestamp.Hour;
+	tm.day  = info->ModifyTimestamp.Day;
+	tm.mon  = info->ModifyTimestamp.Month;
+	tm.year = info->ModifyTimestamp.Year;
+	p_fs->fs_func->set_entry_time(ep, &tm, TM_MODIFY);
+
+	p_fs->fs_func->set_entry_size(ep2, info->Size);
+
+	if (p_fs->vol_type != EXFAT) {
+		buf_modify(sb, sector);
+	} else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSetStat */
+
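+/* ffsMapCluster : map a cluster offset within a file to a volume cluster,
+ * allocating and linking one new cluster when the offset is just past the
+ * end of the current chain */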
+s32 ffsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu)
+{
+	s32 num_clusters, num_alloced, modified = FALSE;
+	u32 last_clu, sector = 0;
+	CHAIN_T new_clu;
+	DENTRY_T *ep;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	fid->rwoffset = (s64)(clu_offset) << p_fs->cluster_size_bits;
+
+	if (EXFAT_I(inode)->mmu_private == 0)
+		num_clusters = 0;
+	else
+		num_clusters = (s32)((EXFAT_I(inode)->mmu_private-1) >> p_fs->cluster_size_bits) + 1;
+
+	*clu = last_clu = fid->start_clu;
+
+	if (fid->flags == 0x03) {
+		if ((clu_offset > 0) && (*clu != CLUSTER_32(~0))) {
+			last_clu += clu_offset - 1;
+
+			if (clu_offset == num_clusters)
+				*clu = CLUSTER_32(~0);
+			else
+				*clu += clu_offset;
+		}
+	} else {
+		/* hint information */
+		if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+			(clu_offset >= fid->hint_last_off)) {
+			clu_offset -= fid->hint_last_off;
+			*clu = fid->hint_last_clu;
+		}
+
+		while ((clu_offset > 0) && (*clu != CLUSTER_32(~0))) {
+			last_clu = *clu;
+			if (FAT_read(sb, *clu, clu) == -1)
+				return FFS_MEDIAERR;
+			clu_offset--;
+		}
+	}
+
+	if (*clu == CLUSTER_32(~0)) {
+		fs_set_vol_flags(sb, VOL_DIRTY);
+
+		new_clu.dir = (last_clu == CLUSTER_32(~0)) ? CLUSTER_32(~0) : last_clu+1;
+		new_clu.size = 0;
+		new_clu.flags = fid->flags;
+
+		/* (1) allocate a cluster */
+		num_alloced = p_fs->fs_func->alloc_cluster(sb, 1, &new_clu);
+		if (num_alloced < 0)
+			return FFS_MEDIAERR;
+		else if (num_alloced == 0)
+			return FFS_FULL;
+
+		/* (2) append to the FAT chain */
+		if (last_clu == CLUSTER_32(~0)) {
+			if (new_clu.flags == 0x01)
+				fid->flags = 0x01;
+			fid->start_clu = new_clu.dir;
+			modified = TRUE;
+		} else {
+			if (new_clu.flags != fid->flags) {
+				exfat_chain_cont_cluster(sb, fid->start_clu, num_clusters);
+				fid->flags = 0x01;
+				modified = TRUE;
+			}
+			if (new_clu.flags == 0x01)
+				FAT_write(sb, last_clu, new_clu.dir);
+		}
+
+		num_clusters += num_alloced;
+		*clu = new_clu.dir;
+
+		if (p_fs->vol_type == EXFAT) {
+			es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+			if (es == NULL)
+				return FFS_MEDIAERR;
+			/* get stream entry */
+			ep++;
+		}
+
+		/* (3) update directory entry */
+		if (modified) {
+			if (p_fs->vol_type != EXFAT) {
+				ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+				if (!ep)
+					return FFS_MEDIAERR;
+			}
+
+			if (p_fs->fs_func->get_entry_flag(ep) != fid->flags)
+				p_fs->fs_func->set_entry_flag(ep, fid->flags);
+
+			if (p_fs->fs_func->get_entry_clu0(ep) != fid->start_clu)
+				p_fs->fs_func->set_entry_clu0(ep, fid->start_clu);
+
+			if (p_fs->vol_type != EXFAT)
+				buf_modify(sb, sector);
+		}
+
+		if (p_fs->vol_type == EXFAT) {
+			update_dir_checksum_with_entry_set(sb, es);
+			release_entry_set(es);
+		}
+
+		/* add number of new blocks to inode */
+		inode->i_blocks += num_alloced << (p_fs->cluster_size_bits - 9);
+	}
+
+	/* hint information */
+	fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+	fid->hint_last_clu = *clu;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsMapCluster */
+
+/*----------------------------------------------------------------------*/
+/*  Directory Operation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+/* ffsCreateDir : create(make) a directory */
+s32 ffsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	s32 ret;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	DPRINTK("ffsCreateDir entered\n");
+
+	/* check the validity of the directory name in the given pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	ret = create_dir(inode, &dir, &uni_name, fid);
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsCreateDir */
+
+/* ffsReadDir : read a directory entry from the opened directory */
+s32 ffsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry)
+{
+	int i, dentry, clu_offset;
+	s32 dentries_per_clu, dentries_per_clu_bits = 0;
+	u32 type, sector;
+	CHAIN_T dir, clu;
+	UNI_NAME_T uni_name;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_DIR)
+		return FFS_PERMISSIONERR;
+
+	if (fid->entry == -1) {
+		dir.dir = p_fs->root_dir;
+		dir.flags = 0x01;
+	} else {
+		dir.dir = fid->start_clu;
+		dir.size = (s32)(fid->size >> p_fs->cluster_size_bits);
+		dir.flags = fid->flags;
+	}
+
+	dentry = (s32) fid->rwoffset;
+
+	if (dir.dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+
+		if (dentry == dentries_per_clu) {
+			clu.dir = CLUSTER_32(~0);
+		} else {
+			clu.dir = dir.dir;
+			clu.size = dir.size;
+			clu.flags = dir.flags;
+		}
+	} else {
+		dentries_per_clu = p_fs->dentries_per_clu;
+		dentries_per_clu_bits = ilog2(dentries_per_clu);
+
+		clu_offset = dentry >> dentries_per_clu_bits;
+		clu.dir = dir.dir;
+		clu.size = dir.size;
+		clu.flags = dir.flags;
+
+		if (clu.flags == 0x03) {
+			clu.dir += clu_offset;
+			clu.size -= clu_offset;
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu.dir = fid->hint_last_clu;
+			}
+
+			while (clu_offset > 0) {
+				/* clu.dir = FAT_read(sb, clu.dir); */
+				if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+	}
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (dir.dir == CLUSTER_32(0)) /* FAT16 root_dir */
+			i = dentry % dentries_per_clu;
+		else
+			i = dentry & (dentries_per_clu-1);
+
+		for ( ; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, &sector);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			buf_lock(sb, sector);
+			dir_entry->Attr = p_fs->fs_func->get_entry_attr(ep);
+
+			p_fs->fs_func->get_entry_time(ep, &tm, TM_CREATE);
+			dir_entry->CreateTimestamp.Year = tm.year;
+			dir_entry->CreateTimestamp.Month = tm.mon;
+			dir_entry->CreateTimestamp.Day = tm.day;
+			dir_entry->CreateTimestamp.Hour = tm.hour;
+			dir_entry->CreateTimestamp.Minute = tm.min;
+			dir_entry->CreateTimestamp.Second = tm.sec;
+			dir_entry->CreateTimestamp.MilliSecond = 0;
+
+			p_fs->fs_func->get_entry_time(ep, &tm, TM_MODIFY);
+			dir_entry->ModifyTimestamp.Year = tm.year;
+			dir_entry->ModifyTimestamp.Month = tm.mon;
+			dir_entry->ModifyTimestamp.Day = tm.day;
+			dir_entry->ModifyTimestamp.Hour = tm.hour;
+			dir_entry->ModifyTimestamp.Minute = tm.min;
+			dir_entry->ModifyTimestamp.Second = tm.sec;
+			dir_entry->ModifyTimestamp.MilliSecond = 0;
+
+			memset((char *) &dir_entry->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+
+			*(uni_name.name) = 0x0;
+			p_fs->fs_func->get_uni_name_from_ext_entry(sb, &dir, dentry, uni_name.name);
+			if (*(uni_name.name) == 0x0 && p_fs->vol_type != EXFAT)
+				get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x1);
+			nls_uniname_to_cstring(sb, dir_entry->Name, &uni_name);
+			buf_unlock(sb, sector);
+
+			if (p_fs->vol_type == EXFAT) {
+				ep = get_entry_in_dir(sb, &clu, i+1, NULL);
+				if (!ep)
+					return FFS_MEDIAERR;
+			} else {
+				get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x0);
+				nls_uniname_to_cstring(sb, dir_entry->ShortName, &uni_name);
+			}
+
+			dir_entry->Size = p_fs->fs_func->get_entry_size(ep);
+
+			/* hint information (none is kept for the FAT16 root_dir) */
+			if (dir.dir != CLUSTER_32(0)) {
+				fid->hint_last_off = dentry >> dentries_per_clu_bits;
+				fid->hint_last_clu = clu.dir;
+			}
+
+			fid->rwoffset = (s64) ++dentry;
+
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+
+			return FFS_SUCCESS;
+		}
+
+		if (dir.dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			/* clu.dir = FAT_read(sb, clu.dir); */
+			if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+				return FFS_MEDIAERR;
+		}
+	}
+
+	*(dir_entry->Name) = '\0';
+
+	fid->rwoffset = (s64) ++dentry;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsReadDir */
+
+/* ffsRemoveDir : remove a directory */
+s32 ffsRemoveDir(struct inode *inode, FILE_ID_T *fid)
+{
+	s32 dentry;
+	CHAIN_T dir, clu_to_free;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	dir.dir = fid->dir.dir;
+	dir.size = fid->dir.size;
+	dir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	/* check if the file is "." or ".." */
+	if (p_fs->vol_type != EXFAT) {
+		if ((dir.dir != p_fs->root_dir) && (dentry < 2))
+			return FFS_PERMISSIONERR;
+	}
+
+	clu_to_free.dir = fid->start_clu;
+	clu_to_free.size = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+	clu_to_free.flags = fid->flags;
+
+	if (!is_dir_empty(sb, &clu_to_free))
+		return FFS_FILEEXIST;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* (1) update the directory entry */
+	remove_file(inode, &dir, dentry);
+
+	/* (2) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu_to_free, 1);
+
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsRemoveDir */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+/*
+ *  File System Management Functions
+ */
+
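+/* fs_init : verify that every on-disk directory entry structure is exactly
+ * DENTRY_SIZE bytes so that raw buffer casts remain valid */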
+s32 fs_init(void)
+{
+	/* critical check for system requirement on size of DENTRY_T structure */
+	if (sizeof(DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(DOS_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(EXT_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(FILE_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(STRM_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(NAME_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(BMAP_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(CASE_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(VOLM_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	return FFS_SUCCESS;
+} /* end of fs_init */
+
+s32 fs_shutdown(void)
+{
+	return FFS_SUCCESS;
+} /* end of fs_shutdown */
+
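+/* fs_set_vol_flags : update the cached volume flags and, on exfat, write the
+ * new flag value into the boot sector (PBR) on disk */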
+void fs_set_vol_flags(struct super_block *sb, u32 new_flag)
+{
+	PBR_SECTOR_T *p_pbr;
+	BPBEX_T *p_bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->vol_flag == new_flag)
+		return;
+
+	p_fs->vol_flag = new_flag;
+
+	if (p_fs->vol_type == EXFAT) {
+		if (p_fs->pbr_bh == NULL) {
+			if (sector_read(sb, p_fs->PBR_sector, &(p_fs->pbr_bh), 1) != FFS_SUCCESS)
+				return;
+		}
+
+		p_pbr = (PBR_SECTOR_T *) p_fs->pbr_bh->b_data;
+		p_bpb = (BPBEX_T *) p_pbr->bpb;
+		SET16(p_bpb->vol_flags, (u16) new_flag);
+
+		/* XXX (duyoung): errors cannot be reported here
+		 * because fs_set_vol_flags() returns void */
+		if ((new_flag == VOL_DIRTY) && (!buffer_dirty(p_fs->pbr_bh)))
+			sector_write(sb, p_fs->PBR_sector, p_fs->pbr_bh, 1);
+		else
+			sector_write(sb, p_fs->PBR_sector, p_fs->pbr_bh, 0);
+	}
+} /* end of fs_set_vol_flags */
+
+void fs_sync(struct super_block *sb, s32 do_sync)
+{
+	if (do_sync)
+		bdev_sync(sb);
+} /* end of fs_sync */
+
+void fs_error(struct super_block *sb)
+{
+	struct exfat_mount_options *opts = &EXFAT_SB(sb)->options;
+
+	if (opts->errors == EXFAT_ERRORS_PANIC)
+		panic("[EXFAT] Filesystem panic from previous error\n");
+	else if ((opts->errors == EXFAT_ERRORS_RO) && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_flags |= MS_RDONLY;
+		printk(KERN_ERR "[EXFAT] Filesystem has been set read-only\n");
+	}
+}
+
+/*
+ *  Cluster Management Functions
+ */
+
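+/* clear_cluster : zero-fill every sector of the given cluster (or the FAT16
+ * root directory area when clu is 0) */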
+s32 clear_cluster(struct super_block *sb, u32 clu)
+{
+	u32 s, n;
+	s32 ret = FFS_SUCCESS;
+	struct buffer_head *tmp_bh = NULL;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (clu == CLUSTER_32(0)) { /* FAT16 root_dir */
+		s = p_fs->root_start_sector;
+		n = p_fs->data_start_sector;
+	} else {
+		s = START_SECTOR(clu);
+		n = s + p_fs->sectors_per_clu;
+	}
+
+	for (; s < n; s++) {
+		ret = sector_read(sb, s, &tmp_bh, 0);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		memset((char *) tmp_bh->b_data, 0x0, p_bd->sector_size);
+		ret = sector_write(sb, s, tmp_bh, 0);
+		if (ret != FFS_SUCCESS)
+			break;
+	}
+
+	brelse(tmp_bh);
+	return ret;
+} /* end of clear_cluster */
+
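+/* fat_alloc_cluster : allocate up to 'num_alloc' clusters by scanning the FAT
+ * for free entries and linking them into p_chain; returns the number of
+ * clusters actually allocated, or -1 on I/O error */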
+s32 fat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain)
+{
+	int i, num_clusters = 0;
+	u32 new_clu, last_clu = CLUSTER_32(~0), read_clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	new_clu = p_chain->dir;
+	if (new_clu == CLUSTER_32(~0))
+		new_clu = p_fs->clu_srch_ptr;
+	else if (new_clu >= p_fs->num_clusters)
+		new_clu = 2;
+
+	__set_sb_dirty(sb);
+
+	p_chain->dir = CLUSTER_32(~0);
+
+	for (i = 2; i < p_fs->num_clusters; i++) {
+		if (FAT_read(sb, new_clu, &read_clu) != 0)
+			return -1;
+
+		if (read_clu == CLUSTER_32(0)) {
+			if (FAT_write(sb, new_clu, CLUSTER_32(~0)) < 0)
+				return -1;
+			num_clusters++;
+
+			if (p_chain->dir == CLUSTER_32(~0))
+				p_chain->dir = new_clu;
+			else {
+				if (FAT_write(sb, last_clu, new_clu) < 0)
+					return -1;
+			}
+
+			last_clu = new_clu;
+
+			if ((--num_alloc) == 0) {
+				p_fs->clu_srch_ptr = new_clu;
+				if (p_fs->used_clusters != (u32) ~0)
+					p_fs->used_clusters += num_clusters;
+
+				return num_clusters;
+			}
+		}
+		if ((++new_clu) >= p_fs->num_clusters)
+			new_clu = 2;
+	}
+
+	p_fs->clu_srch_ptr = new_clu;
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters += num_clusters;
+
+	return num_clusters;
+} /* end of fat_alloc_cluster */
+
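+/* exfat_alloc_cluster : allocate clusters from the allocation bitmap, keeping
+ * the chain contiguous (flags 0x03) while possible and converting it to an
+ * explicit FAT chain (flags 0x01) once a gap is hit */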
+s32 exfat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain)
+{
+	s32 num_clusters = 0;
+	u32 hint_clu, new_clu, last_clu = CLUSTER_32(~0);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	hint_clu = p_chain->dir;
+	if (hint_clu == CLUSTER_32(~0)) {
+		hint_clu = test_alloc_bitmap(sb, p_fs->clu_srch_ptr-2);
+		if (hint_clu == CLUSTER_32(~0))
+			return 0;
+	} else if (hint_clu >= p_fs->num_clusters) {
+		hint_clu = 2;
+		p_chain->flags = 0x01;
+	}
+
+	__set_sb_dirty(sb);
+
+	p_chain->dir = CLUSTER_32(~0);
+
+	while ((new_clu = test_alloc_bitmap(sb, hint_clu-2)) != CLUSTER_32(~0)) {
+		if (new_clu != hint_clu) {
+			if (p_chain->flags == 0x03) {
+				exfat_chain_cont_cluster(sb, p_chain->dir, num_clusters);
+				p_chain->flags = 0x01;
+			}
+		}
+
+		if (set_alloc_bitmap(sb, new_clu-2) != FFS_SUCCESS)
+			return -1;
+
+		num_clusters++;
+
+		if (p_chain->flags == 0x01) {
+			if (FAT_write(sb, new_clu, CLUSTER_32(~0)) < 0)
+				return -1;
+		}
+
+		if (p_chain->dir == CLUSTER_32(~0)) {
+			p_chain->dir = new_clu;
+		} else {
+			if (p_chain->flags == 0x01) {
+				if (FAT_write(sb, last_clu, new_clu) < 0)
+					return -1;
+			}
+		}
+		last_clu = new_clu;
+
+		if ((--num_alloc) == 0) {
+			p_fs->clu_srch_ptr = hint_clu;
+			if (p_fs->used_clusters != (u32) ~0)
+				p_fs->used_clusters += num_clusters;
+
+			p_chain->size += num_clusters;
+			return num_clusters;
+		}
+
+		hint_clu = new_clu + 1;
+		if (hint_clu >= p_fs->num_clusters) {
+			hint_clu = 2;
+
+			if (p_chain->flags == 0x03) {
+				exfat_chain_cont_cluster(sb, p_chain->dir, num_clusters);
+				p_chain->flags = 0x01;
+			}
+		}
+	}
+
+	p_fs->clu_srch_ptr = hint_clu;
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters += num_clusters;
+
+	p_chain->size += num_clusters;
+	return num_clusters;
+} /* end of exfat_alloc_cluster */
+
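+/* fat_free_cluster : walk the FAT chain from p_chain->dir, marking each
+ * cluster free and optionally releasing its cached sector buffers */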
+void fat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse)
+{
+	s32 num_clusters = 0;
+	u32 clu, prev;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	int i;
+	u32 sector;
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return;
+	__set_sb_dirty(sb);
+	clu = p_chain->dir;
+
+	if (p_chain->size <= 0)
+		return;
+
+	do {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (do_relse) {
+			sector = START_SECTOR(clu);
+			for (i = 0; i < p_fs->sectors_per_clu; i++)
+				buf_release(sb, sector+i);
+		}
+
+		prev = clu;
+		if (FAT_read(sb, clu, &clu) == -1)
+			break;
+
+		if (FAT_write(sb, prev, CLUSTER_32(0)) < 0)
+			break;
+		num_clusters++;
+
+	} while (clu != CLUSTER_32(~0));
+
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters -= num_clusters;
+} /* end of fat_free_cluster */
+
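+/* exfat_free_cluster : clear the allocation bitmap bits for a contiguous
+ * (flags 0x03) or FAT-chained cluster run and update the used-cluster count */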
+void exfat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse)
+{
+	s32 num_clusters = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	int i;
+	u32 sector;
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return;
+
+	if (p_chain->size <= 0) {
+		printk(KERN_ERR "[EXFAT] free_cluster : skip free-req clu:%u, "
+				"because of zero-size truncation\n"
+				,p_chain->dir);
+		return;
+	}
+
+	__set_sb_dirty(sb);
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		do {
+			if (do_relse) {
+				sector = START_SECTOR(clu);
+				for (i = 0; i < p_fs->sectors_per_clu; i++)
+					buf_release(sb, sector+i);
+			}
+
+			if (clr_alloc_bitmap(sb, clu-2) != FFS_SUCCESS)
+				break;
+			clu++;
+
+			num_clusters++;
+		} while (num_clusters < p_chain->size);
+	} else {
+		do {
+			if (p_fs->dev_ejected)
+				break;
+
+			if (do_relse) {
+				sector = START_SECTOR(clu);
+				for (i = 0; i < p_fs->sectors_per_clu; i++)
+					buf_release(sb, sector+i);
+			}
+
+			if (clr_alloc_bitmap(sb, clu-2) != FFS_SUCCESS)
+				break;
+
+			if (FAT_read(sb, clu, &clu) == -1)
+				break;
+			num_clusters++;
+		} while ((clu != CLUSTER_32(0)) && (clu != CLUSTER_32(~0)));
+	}
+
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters -= num_clusters;
+} /* end of exfat_free_cluster */
+
+u32 find_last_cluster(struct super_block *sb, CHAIN_T *p_chain)
+{
+	u32 clu, next;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		clu += p_chain->size - 1;
+	} else {
+		while ((FAT_read(sb, clu, &next) == 0) && (next != CLUSTER_32(~0))) {
+			if (p_fs->dev_ejected)
+				break;
+			clu = next;
+		}
+	}
+
+	return clu;
+} /* end of find_last_cluster */
+
+s32 count_num_clusters(struct super_block *sb, CHAIN_T *p_chain)
+{
+	int i, count = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return 0;
+
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		count = p_chain->size;
+	} else {
+		for (i = 2; i < p_fs->num_clusters; i++) {
+			count++;
+			if (FAT_read(sb, clu, &clu) != 0)
+				return 0;
+			if (clu == CLUSTER_32(~0))
+				break;
+		}
+	}
+
+	return count;
+} /* end of count_num_clusters */
+
+s32 fat_count_used_clusters(struct super_block *sb)
+{
+	int i, count = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = 2; i < p_fs->num_clusters; i++) {
+		if (FAT_read(sb, i, &clu) != 0)
+			break;
+		if (clu != CLUSTER_32(0))
+			count++;
+	}
+
+	return count;
+} /* end of fat_count_used_clusters */
+
+s32 exfat_count_used_clusters(struct super_block *sb)
+{
+	int i, map_i, map_b, count = 0;
+	u8 k;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	map_i = map_b = 0;
+
+	for (i = 2; i < p_fs->num_clusters; i += 8) {
+		k = *(((u8 *) p_fs->vol_amap[map_i]->b_data) + map_b);
+		count += used_bit[k];
+
+		if ((++map_b) >= p_bd->sector_size) {
+			map_i++;
+			map_b = 0;
+		}
+	}
+
+	return count;
+} /* end of exfat_count_used_clusters */
+
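+/* exfat_chain_cont_cluster : write explicit FAT links for a run of 'len'
+ * contiguous clusters and terminate the chain with an end-of-chain mark */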
+void exfat_chain_cont_cluster(struct super_block *sb, u32 chain, s32 len)
+{
+	if (len == 0)
+		return;
+
+	while (len > 1) {
+		if (FAT_write(sb, chain, chain+1) < 0)
+			break;
+		chain++;
+		len--;
+	}
+	FAT_write(sb, chain, CLUSTER_32(~0));
+} /* end of exfat_chain_cont_cluster */
+
+/*
+ *  Allocation Bitmap Management Functions
+ */
+
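+/* load_alloc_bitmap : locate the allocation bitmap entry in the root directory
+ * and read all of its sectors into p_fs->vol_amap */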
+s32 load_alloc_bitmap(struct super_block *sb)
+{
+	int i, j, ret;
+	u32 map_size;
+	u32 type, sector;
+	CHAIN_T clu;
+	BMAP_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu.dir = p_fs->root_dir;
+	clu.flags = 0x01;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < p_fs->dentries_per_clu; i++) {
+			ep = (BMAP_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if (type != TYPE_BITMAP)
+				continue;
+
+			if (ep->flags == 0x0) {
+				p_fs->map_clu  = GET32_A(ep->start_clu);
+				map_size = (u32) GET64_A(ep->size);
+
+				p_fs->map_sectors = ((map_size-1) >> p_bd->sector_size_bits) + 1;
+
+				p_fs->vol_amap = (struct buffer_head **) kmalloc(sizeof(struct buffer_head *) * p_fs->map_sectors, GFP_KERNEL);
+				if (p_fs->vol_amap == NULL)
+					return FFS_MEMORYERR;
+
+				sector = START_SECTOR(p_fs->map_clu);
+
+				for (j = 0; j < p_fs->map_sectors; j++) {
+					p_fs->vol_amap[j] = NULL;
+					ret = sector_read(sb, sector+j, &(p_fs->vol_amap[j]), 1);
+					if (ret != FFS_SUCCESS) {
+						/*  release all buffers and free vol_amap */
+						i = 0;
+						while (i < j)
+							brelse(p_fs->vol_amap[i++]);
+
+						kfree(p_fs->vol_amap);
+						p_fs->vol_amap = NULL;
+						return ret;
+					}
+				}
+
+				p_fs->pbr_bh = NULL;
+				return FFS_SUCCESS;
+			}
+		}
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+
+	return FFS_FORMATERR;
+} /* end of load_alloc_bitmap */
+
+void free_alloc_bitmap(struct super_block *sb)
+{
+	int i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	brelse(p_fs->pbr_bh);
+
+	for (i = 0; i < p_fs->map_sectors; i++)
+		__brelse(p_fs->vol_amap[i]);
+
+	kfree(p_fs->vol_amap);
+	p_fs->vol_amap = NULL;
+} /* end of free_alloc_bitmap */
+
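+/* set_alloc_bitmap : mark a cluster as allocated in the bitmap and write the
+ * affected bitmap sector back (callers pass the cluster number minus 2) */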
+s32 set_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, b;
+	u32 sector;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	i = clu >> (p_bd->sector_size_bits + 3);
+	b = clu & ((p_bd->sector_size << 3) - 1);
+
+	sector = START_SECTOR(p_fs->map_clu) + i;
+
+	exfat_bitmap_set((u8 *) p_fs->vol_amap[i]->b_data, b);
+
+	return sector_write(sb, sector, p_fs->vol_amap[i], 0);
+} /* end of set_alloc_bitmap */
+
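+/* clr_alloc_bitmap : mark a cluster as free in the bitmap, optionally issue a
+ * discard for it, and write the affected bitmap sector back */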
+s32 clr_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, b;
+	u32 sector;
+#ifdef CONFIG_EXFAT_DISCARD
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct exfat_mount_options *opts = &sbi->options;
+	int ret;
+#endif /* CONFIG_EXFAT_DISCARD */
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	i = clu >> (p_bd->sector_size_bits + 3);
+	b = clu & ((p_bd->sector_size << 3) - 1);
+
+	sector = START_SECTOR(p_fs->map_clu) + i;
+
+	exfat_bitmap_clear((u8 *) p_fs->vol_amap[i]->b_data, b);
+
+#ifdef CONFIG_EXFAT_DISCARD
+	/* discard the freed cluster on the device when requested */
+	if (opts->discard) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+		ret = sb_issue_discard(sb, START_SECTOR(clu), (1 << p_fs->sectors_per_clu_bits));
+#else
+		ret = sb_issue_discard(sb, START_SECTOR(clu), (1 << p_fs->sectors_per_clu_bits), GFP_NOFS, 0);
+#endif
+		if (ret == -EOPNOTSUPP) {
+			printk(KERN_WARNING "[EXFAT] discard not supported by device, disabling\n");
+			opts->discard = 0;
+		}
+	}
+#endif /* CONFIG_EXFAT_DISCARD */
+
+	return sector_write(sb, sector, p_fs->vol_amap[i], 0);
+} /* end of clr_alloc_bitmap */
+
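+/* test_alloc_bitmap : return the first free cluster at or after 'clu'
+ * (bitmap-relative, i.e. cluster number minus 2), wrapping around once;
+ * returns CLUSTER_32(~0) when the volume is full */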
+u32 test_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, map_i, map_b;
+	u32 clu_base, clu_free;
+	u8 k, clu_mask;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu_base = (clu & ~(0x7)) + 2;
+	clu_mask = (1 << (clu - clu_base + 2)) - 1;
+
+	map_i = clu >> (p_bd->sector_size_bits + 3);
+	map_b = (clu >> 3) & p_bd->sector_size_mask;
+
+	for (i = 2; i < p_fs->num_clusters; i += 8) {
+		k = *(((u8 *) p_fs->vol_amap[map_i]->b_data) + map_b);
+		if (clu_mask > 0) {
+			k |= clu_mask;
+			clu_mask = 0;
+		}
+		if (k < 0xFF) {
+			clu_free = clu_base + free_bit[k];
+			if (clu_free < p_fs->num_clusters)
+				return clu_free;
+		}
+		clu_base += 8;
+
+		if (((++map_b) >= p_bd->sector_size) || (clu_base >= p_fs->num_clusters)) {
+			if ((++map_i) >= p_fs->map_sectors) {
+				clu_base = 2;
+				map_i = 0;
+			}
+			map_b = 0;
+		}
+	}
+
+	return CLUSTER_32(~0);
+} /* end of test_alloc_bitmap */
+
+void sync_alloc_bitmap(struct super_block *sb)
+{
+	int i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->vol_amap == NULL)
+		return;
+
+	for (i = 0; i < p_fs->map_sectors; i++)
+		sync_dirty_buffer(p_fs->vol_amap[i]);
+} /* end of sync_alloc_bitmap */
+
+/*
+ *  Upcase table Management Functions
+ */
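+
+/* __load_upcase_table : read the on-disk up-case table, expanding its
+ * compressed form (a 0xFFFF marker followed by a count skips an identity
+ * range) and verifying the table checksum against 'utbl_checksum' */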
+s32 __load_upcase_table(struct super_block *sb, u32 sector, u32 num_sectors, u32 utbl_checksum)
+{
+	int i, ret = FFS_ERROR;
+	u32 j;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	struct buffer_head *tmp_bh = NULL;
+
+	u8	skip = FALSE;
+	u32	index = 0;
+	u16	uni = 0;
+	u16 **upcase_table;
+
+	u32 checksum = 0;
+
+	upcase_table = p_fs->vol_utbl = (u16 **) kmalloc(UTBL_COL_COUNT * sizeof(u16 *), GFP_KERNEL);
+	if (upcase_table == NULL)
+		return FFS_MEMORYERR;
+	memset(upcase_table, 0, UTBL_COL_COUNT * sizeof(u16 *));
+
+	num_sectors += sector;
+
+	while (sector < num_sectors) {
+		ret = sector_read(sb, sector, &tmp_bh, 1);
+		if (ret != FFS_SUCCESS) {
+			DPRINTK("sector read (0x%X)fail\n", sector);
+			goto error;
+		}
+		sector++;
+
+		for (i = 0; i < p_bd->sector_size && index <= 0xFFFF; i += 2) {
+			uni = GET16(((u8 *) tmp_bh->b_data)+i);
+
+			checksum = ((checksum & 1) ? 0x80000000 : 0) + (checksum >> 1) + *(((u8 *) tmp_bh->b_data)+i);
+			checksum = ((checksum & 1) ? 0x80000000 : 0) + (checksum >> 1) + *(((u8 *) tmp_bh->b_data)+(i+1));
+
+			if (skip) {
+				DPRINTK("skip from 0x%X ", index);
+				index += uni;
+				DPRINTK("to 0x%X (amount of 0x%X)\n", index, uni);
+				skip = FALSE;
+			} else if (uni == index)
+				index++;
+			else if (uni == 0xFFFF)
+				skip = TRUE;
+			else { /* uni != index , uni != 0xFFFF */
+				u16 col_index = get_col_index(index);
+
+				if (upcase_table[col_index] == NULL) {
+					DPRINTK("alloc = 0x%X\n", col_index);
+					upcase_table[col_index] = (u16 *) kmalloc(UTBL_ROW_COUNT * sizeof(u16), GFP_KERNEL);
+					if (upcase_table[col_index] == NULL) {
+						ret = FFS_MEMORYERR;
+						goto error;
+					}
+
+					for (j = 0; j < UTBL_ROW_COUNT; j++)
+						upcase_table[col_index][j] = (col_index << LOW_INDEX_BIT) | j;
+				}
+
+				upcase_table[col_index][get_row_index(index)] = uni;
+				index++;
+			}
+		}
+	}
+	if (index >= 0xFFFF && utbl_checksum == checksum) {
+		if (tmp_bh)
+			brelse(tmp_bh);
+		return FFS_SUCCESS;
+	}
+	ret = FFS_ERROR;
+error:
+	if (tmp_bh)
+		brelse(tmp_bh);
+	free_upcase_table(sb);
+	return ret;
+}
+
+s32 __load_default_upcase_table(struct super_block *sb)
+{
+	int i, ret = FFS_ERROR;
+	u32 j;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	u8	skip = FALSE;
+	u32	index = 0;
+	u16	uni = 0;
+	u16 **upcase_table;
+
+	upcase_table = p_fs->vol_utbl = (u16 **) kmalloc(UTBL_COL_COUNT * sizeof(u16 *), GFP_KERNEL);
+	if (upcase_table == NULL)
+		return FFS_MEMORYERR;
+	memset(upcase_table, 0, UTBL_COL_COUNT * sizeof(u16 *));
+
+	for (i = 0; index <= 0xFFFF && i < NUM_UPCASE*2; i += 2) {
+		uni = GET16(uni_upcase + i);
+		if (skip) {
+			DPRINTK("skip from 0x%X ", index);
+			index += uni;
+			DPRINTK("to 0x%X (amount of 0x%X)\n", index, uni);
+			skip = FALSE;
+		} else if (uni == index)
+			index++;
+		else if (uni == 0xFFFF)
+			skip = TRUE;
+		else { /* uni != index , uni != 0xFFFF */
+			u16 col_index = get_col_index(index);
+
+			if (upcase_table[col_index] == NULL) {
+				DPRINTK("alloc = 0x%X\n", col_index);
+				upcase_table[col_index] = (u16 *) kmalloc(UTBL_ROW_COUNT * sizeof(u16), GFP_KERNEL);
+				if (upcase_table[col_index] == NULL) {
+					ret = FFS_MEMORYERR;
+					goto error;
+				}
+
+				for (j = 0; j < UTBL_ROW_COUNT; j++)
+					upcase_table[col_index][j] = (col_index << LOW_INDEX_BIT) | j;
+			}
+
+			upcase_table[col_index][get_row_index(index)] = uni;
+			index++;
+		}
+	}
+
+	if (index >= 0xFFFF)
+		return FFS_SUCCESS;
+
+error:
+	/* FATAL error: default upcase table has error */
+	free_upcase_table(sb);
+	return ret;
+}
+
+s32 load_upcase_table(struct super_block *sb)
+{
+	int i;
+	u32 tbl_clu, tbl_size;
+	u32 type, sector, num_sectors;
+	CHAIN_T clu;
+	CASE_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu.dir = p_fs->root_dir;
+	clu.flags = 0x01;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		for (i = 0; i < p_fs->dentries_per_clu; i++) {
+			ep = (CASE_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if (type != TYPE_UPCASE)
+				continue;
+
+			tbl_clu  = GET32_A(ep->start_clu);
+			tbl_size = (u32) GET64_A(ep->size);
+
+			sector = START_SECTOR(tbl_clu);
+			num_sectors = ((tbl_size-1) >> p_bd->sector_size_bits) + 1;
+			if (__load_upcase_table(sb, sector, num_sectors, GET32_A(ep->checksum)) != FFS_SUCCESS)
+				break;
+			else
+				return FFS_SUCCESS;
+		}
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+	/* load default upcase table */
+	return __load_default_upcase_table(sb);
+} /* end of load_upcase_table */
+
+void free_upcase_table(struct super_block *sb)
+{
+	u32 i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	u16 **upcase_table;
+
+	upcase_table = p_fs->vol_utbl;
+	for (i = 0; i < UTBL_COL_COUNT; i++)
+		kfree(upcase_table[i]);
+
+	kfree(p_fs->vol_utbl);
+	p_fs->vol_utbl = NULL;
+} /* end of free_upcase_table */
+
+/*
+ *  Directory Entry Management Functions
+ */
+
+u32 fat_get_entry_type(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	if (*(ep->name) == 0x0)
+		return TYPE_UNUSED;
+
+	else if (*(ep->name) == 0xE5)
+		return TYPE_DELETED;
+
+	else if (ep->attr == ATTR_EXTEND)
+		return TYPE_EXTEND;
+
+	else if ((ep->attr & (ATTR_SUBDIR|ATTR_VOLUME)) == ATTR_VOLUME)
+		return TYPE_VOLUME;
+
+	else if ((ep->attr & (ATTR_SUBDIR|ATTR_VOLUME)) == ATTR_SUBDIR)
+		return TYPE_DIR;
+
+	return TYPE_FILE;
+} /* end of fat_get_entry_type */
+
+u32 exfat_get_entry_type(DENTRY_T *p_entry)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	if (ep->type == 0x0) {
+		return TYPE_UNUSED;
+	} else if (ep->type < 0x80) {
+		return TYPE_DELETED;
+	} else if (ep->type == 0x80) {
+		return TYPE_INVALID;
+	} else if (ep->type < 0xA0) {
+		if (ep->type == 0x81) {
+			return TYPE_BITMAP;
+		} else if (ep->type == 0x82) {
+			return TYPE_UPCASE;
+		} else if (ep->type == 0x83) {
+			return TYPE_VOLUME;
+		} else if (ep->type == 0x85) {
+			if (GET16_A(ep->attr) & ATTR_SUBDIR)
+				return TYPE_DIR;
+			else
+				return TYPE_FILE;
+		}
+		return TYPE_CRITICAL_PRI;
+	} else if (ep->type < 0xC0) {
+		if (ep->type == 0xA0)
+			return TYPE_GUID;
+		else if (ep->type == 0xA1)
+			return TYPE_PADDING;
+		else if (ep->type == 0xA2)
+			return TYPE_ACLTAB;
+		return TYPE_BENIGN_PRI;
+	} else if (ep->type < 0xE0) {
+		if (ep->type == 0xC0)
+			return TYPE_STREAM;
+		else if (ep->type == 0xC1)
+			return TYPE_EXTEND;
+		else if (ep->type == 0xC2)
+			return TYPE_ACL;
+		return TYPE_CRITICAL_SEC;
+	}
+
+	return TYPE_BENIGN_SEC;
+} /* end of exfat_get_entry_type */
+
+void fat_set_entry_type(DENTRY_T *p_entry, u32 type)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	if (type == TYPE_UNUSED)
+		*(ep->name) = 0x0;
+
+	else if (type == TYPE_DELETED)
+		*(ep->name) = 0xE5;
+
+	else if (type == TYPE_EXTEND)
+		ep->attr = ATTR_EXTEND;
+
+	else if (type == TYPE_DIR)
+		ep->attr = ATTR_SUBDIR;
+
+	else if (type == TYPE_FILE)
+		ep->attr = ATTR_ARCHIVE;
+
+	else if (type == TYPE_SYMLINK)
+		ep->attr = ATTR_ARCHIVE | ATTR_SYMLINK;
+} /* end of fat_set_entry_type */
+
+void exfat_set_entry_type(DENTRY_T *p_entry, u32 type)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	if (type == TYPE_UNUSED) {
+		ep->type = 0x0;
+	} else if (type == TYPE_DELETED) {
+		ep->type &= ~0x80;
+	} else if (type == TYPE_STREAM) {
+		ep->type = 0xC0;
+	} else if (type == TYPE_EXTEND) {
+		ep->type = 0xC1;
+	} else if (type == TYPE_BITMAP) {
+		ep->type = 0x81;
+	} else if (type == TYPE_UPCASE) {
+		ep->type = 0x82;
+	} else if (type == TYPE_VOLUME) {
+		ep->type = 0x83;
+	} else if (type == TYPE_DIR) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_SUBDIR);
+	} else if (type == TYPE_FILE) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_ARCHIVE);
+	} else if (type == TYPE_SYMLINK) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_ARCHIVE | ATTR_SYMLINK);
+	}
+} /* end of exfat_set_entry_type */
+
+u32 fat_get_entry_attr(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return (u32) ep->attr;
+} /* end of fat_get_entry_attr */
+
+u32 exfat_get_entry_attr(DENTRY_T *p_entry)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+	return (u32) GET16_A(ep->attr);
+} /* end of exfat_get_entry_attr */
+
+void fat_set_entry_attr(DENTRY_T *p_entry, u32 attr)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	ep->attr = (u8) attr;
+} /* end of fat_set_entry_attr */
+
+void exfat_set_entry_attr(DENTRY_T *p_entry, u32 attr)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+	SET16_A(ep->attr, (u16) attr);
+} /* end of exfat_set_entry_attr */
+
+u8 fat_get_entry_flag(DENTRY_T *p_entry)
+{
+	return 0x01;
+} /* end of fat_get_entry_flag */
+
+u8 exfat_get_entry_flag(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return ep->flags;
+} /* end of exfat_get_entry_flag */
+
+void fat_set_entry_flag(DENTRY_T *p_entry, u8 flags)
+{
+} /* end of fat_set_entry_flag */
+
+void exfat_set_entry_flag(DENTRY_T *p_entry, u8 flags)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	ep->flags = flags;
+} /* end of exfat_set_entry_flag */
+
+u32 fat_get_entry_clu0(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return ((u32) GET16_A(ep->start_clu_hi) << 16) | GET16_A(ep->start_clu_lo);
+} /* end of fat_get_entry_clu0 */
+
+u32 exfat_get_entry_clu0(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return GET32_A(ep->start_clu);
+} /* end of exfat_get_entry_clu0 */
+
+void fat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	SET16_A(ep->start_clu_lo, CLUSTER_16(start_clu));
+	SET16_A(ep->start_clu_hi, CLUSTER_16(start_clu >> 16));
+} /* end of fat_set_entry_clu0 */
+
+void exfat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	SET32_A(ep->start_clu, start_clu);
+} /* end of exfat_set_entry_clu0 */
+
+u64 fat_get_entry_size(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return (u64) GET32_A(ep->size);
+} /* end of fat_get_entry_size */
+
+u64 exfat_get_entry_size(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return GET64_A(ep->valid_size);
+} /* end of exfat_get_entry_size */
+
+void fat_set_entry_size(DENTRY_T *p_entry, u64 size)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	SET32_A(ep->size, (u32) size);
+} /* end of fat_set_entry_size */
+
+void exfat_set_entry_size(DENTRY_T *p_entry, u64 size)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	SET64_A(ep->valid_size, size);
+	SET64_A(ep->size, size);
+} /* end of exfat_set_entry_size */
+
+void fat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t = 0x00, d = 0x21;
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	switch (mode) {
+	case TM_CREATE:
+		t = GET16_A(ep->create_time);
+		d = GET16_A(ep->create_date);
+		break;
+	case TM_MODIFY:
+		t = GET16_A(ep->modify_time);
+		d = GET16_A(ep->modify_date);
+		break;
+	}
+
+	tp->sec  = (t & 0x001F) << 1;
+	tp->min  = (t >> 5) & 0x003F;
+	tp->hour = (t >> 11);
+	tp->day  = (d & 0x001F);
+	tp->mon  = (d >> 5) & 0x000F;
+	tp->year = (d >> 9);
+} /* end of fat_get_entry_time */
+
+void exfat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t = 0x00, d = 0x21;
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	switch (mode) {
+	case TM_CREATE:
+		t = GET16_A(ep->create_time);
+		d = GET16_A(ep->create_date);
+		break;
+	case TM_MODIFY:
+		t = GET16_A(ep->modify_time);
+		d = GET16_A(ep->modify_date);
+		break;
+	case TM_ACCESS:
+		t = GET16_A(ep->access_time);
+		d = GET16_A(ep->access_date);
+		break;
+	}
+
+	tp->sec  = (t & 0x001F) << 1;
+	tp->min  = (t >> 5) & 0x003F;
+	tp->hour = (t >> 11);
+	tp->day  = (d & 0x001F);
+	tp->mon  = (d >> 5) & 0x000F;
+	tp->year = (d >> 9);
+} /* end of exfat_get_entry_time */
+
+void fat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t, d;
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	t = (tp->hour << 11) | (tp->min << 5) | (tp->sec >> 1);
+	d = (tp->year <<  9) | (tp->mon << 5) |  tp->day;
+
+	switch (mode) {
+	case TM_CREATE:
+		SET16_A(ep->create_time, t);
+		SET16_A(ep->create_date, d);
+		break;
+	case TM_MODIFY:
+		SET16_A(ep->modify_time, t);
+		SET16_A(ep->modify_date, d);
+		break;
+	}
+} /* end of fat_set_entry_time */
+
+void exfat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t, d;
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	t = (tp->hour << 11) | (tp->min << 5) | (tp->sec >> 1);
+	d = (tp->year <<  9) | (tp->mon << 5) |  tp->day;
+
+	switch (mode) {
+	case TM_CREATE:
+		SET16_A(ep->create_time, t);
+		SET16_A(ep->create_date, d);
+		break;
+	case TM_MODIFY:
+		SET16_A(ep->modify_time, t);
+		SET16_A(ep->modify_date, d);
+		break;
+	case TM_ACCESS:
+		SET16_A(ep->access_time, t);
+		SET16_A(ep->access_date, d);
+		break;
+	}
+} /* end of exfat_set_entry_time */
+
+s32 fat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+						 u32 start_clu, u64 size)
+{
+	u32 sector;
+	DOS_DENTRY_T *dos_ep;
+
+	dos_ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!dos_ep)
+		return FFS_MEDIAERR;
+
+	init_dos_entry(dos_ep, type, start_clu);
+	buf_modify(sb, sector);
+
+	return FFS_SUCCESS;
+} /* end of fat_init_dir_entry */
+
+s32 exfat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+						   u32 start_clu, u64 size)
+{
+	u32 sector;
+	u8 flags;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+
+	flags = (type == TYPE_FILE) ? 0x01 : 0x03;
+
+	/* we cannot use get_entry_set_in_dir here because file ep is not initialized yet */
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return FFS_MEDIAERR;
+
+	strm_ep = (STRM_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+1, &sector);
+	if (!strm_ep)
+		return FFS_MEDIAERR;
+
+	init_file_entry(file_ep, type);
+	buf_modify(sb, sector);
+
+	init_strm_entry(strm_ep, flags, start_clu, size);
+	buf_modify(sb, sector);
+
+	return FFS_SUCCESS;
+} /* end of exfat_init_dir_entry */
+
+s32 fat_init_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+						 UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i;
+	u32 sector;
+	u8 chksum;
+	u16 *uniname = p_uniname->name;
+	DOS_DENTRY_T *dos_ep;
+	EXT_DENTRY_T *ext_ep;
+
+	dos_ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!dos_ep)
+		return FFS_MEDIAERR;
+
+	dos_ep->lcase = p_dosname->name_case;
+	memcpy(dos_ep->name, p_dosname->name, DOS_NAME_LENGTH);
+	buf_modify(sb, sector);
+
+	if ((--num_entries) > 0) {
+		chksum = calc_checksum_1byte((void *) dos_ep->name, DOS_NAME_LENGTH, 0);
+
+		for (i = 1; i < num_entries; i++) {
+			ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry-i, &sector);
+			if (!ext_ep)
+				return FFS_MEDIAERR;
+
+			init_ext_entry(ext_ep, i, chksum, uniname);
+			buf_modify(sb, sector);
+			uniname += 13;
+		}
+
+		ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry-i, &sector);
+		if (!ext_ep)
+			return FFS_MEDIAERR;
+
+		init_ext_entry(ext_ep, i+0x40, chksum, uniname);
+		buf_modify(sb, sector);
+	}
+
+	return FFS_SUCCESS;
+} /* end of fat_init_ext_entry */
+
+s32 exfat_init_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+						   UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i;
+	u32 sector;
+	u16 *uniname = p_uniname->name;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+	NAME_DENTRY_T *name_ep;
+
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return FFS_MEDIAERR;
+
+	file_ep->num_ext = (u8)(num_entries - 1);
+	buf_modify(sb, sector);
+
+	strm_ep = (STRM_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+1, &sector);
+	if (!strm_ep)
+		return FFS_MEDIAERR;
+
+	strm_ep->name_len = p_uniname->name_len;
+	SET16_A(strm_ep->name_hash, p_uniname->name_hash);
+	buf_modify(sb, sector);
+
+	for (i = 2; i < num_entries; i++) {
+		name_ep = (NAME_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+i, &sector);
+		if (!name_ep)
+			return FFS_MEDIAERR;
+
+		init_name_entry(name_ep, uniname);
+		buf_modify(sb, sector);
+		uniname += 15;
+	}
+
+	update_dir_checksum(sb, p_dir, entry);
+
+	return FFS_SUCCESS;
+} /* end of exfat_init_ext_entry */
+
+void init_dos_entry(DOS_DENTRY_T *ep, u32 type, u32 start_clu)
+{
+	TIMESTAMP_T tm, *tp;
+
+	fat_set_entry_type((DENTRY_T *) ep, type);
+	SET16_A(ep->start_clu_lo, CLUSTER_16(start_clu));
+	SET16_A(ep->start_clu_hi, CLUSTER_16(start_clu >> 16));
+	SET32_A(ep->size, 0);
+
+	tp = tm_current(&tm);
+	fat_set_entry_time((DENTRY_T *) ep, tp, TM_CREATE);
+	fat_set_entry_time((DENTRY_T *) ep, tp, TM_MODIFY);
+	SET16_A(ep->access_date, 0);
+	ep->create_time_ms = 0;
+} /* end of init_dos_entry */
+
+void init_ext_entry(EXT_DENTRY_T *ep, s32 order, u8 chksum, u16 *uniname)
+{
+	int i;
+	u8 end = FALSE;
+
+	fat_set_entry_type((DENTRY_T *) ep, TYPE_EXTEND);
+	ep->order = (u8) order;
+	ep->sysid = 0;
+	ep->checksum = chksum;
+	SET16_A(ep->start_clu, 0);
+
+	for (i = 0; i < 10; i += 2) {
+		if (!end) {
+			SET16(ep->unicode_0_4+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16(ep->unicode_0_4+i, 0xFFFF);
+		}
+	}
+
+	for (i = 0; i < 12; i += 2) {
+		if (!end) {
+			SET16_A(ep->unicode_5_10+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16_A(ep->unicode_5_10+i, 0xFFFF);
+		}
+	}
+
+	for (i = 0; i < 4; i += 2) {
+		if (!end) {
+			SET16_A(ep->unicode_11_12+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16_A(ep->unicode_11_12+i, 0xFFFF);
+		}
+	}
+} /* end of init_ext_entry */
+
+void init_file_entry(FILE_DENTRY_T *ep, u32 type)
+{
+	TIMESTAMP_T tm, *tp;
+
+	exfat_set_entry_type((DENTRY_T *) ep, type);
+
+	tp = tm_current(&tm);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_CREATE);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_MODIFY);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_ACCESS);
+	ep->create_time_ms = 0;
+	ep->modify_time_ms = 0;
+	ep->access_time_ms = 0;
+} /* end of init_file_entry */
+
+void init_strm_entry(STRM_DENTRY_T *ep, u8 flags, u32 start_clu, u64 size)
+{
+	exfat_set_entry_type((DENTRY_T *) ep, TYPE_STREAM);
+	ep->flags = flags;
+	SET32_A(ep->start_clu, start_clu);
+	SET64_A(ep->valid_size, size);
+	SET64_A(ep->size, size);
+} /* end of init_strm_entry */
+
+void init_name_entry(NAME_DENTRY_T *ep, u16 *uniname)
+{
+	int i;
+
+	exfat_set_entry_type((DENTRY_T *) ep, TYPE_EXTEND);
+	ep->flags = 0x0;
+
+	for (i = 0; i < 30; i += 2) {
+		SET16_A(ep->unicode_0_14+i, *uniname);
+		if (*uniname == 0x0)
+			break;
+		uniname++;
+	}
+} /* end of init_name_entry */
+
+void fat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries)
+{
+	int i;
+	u32 sector;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = num_entries-1; i >= order; i--) {
+		ep = get_entry_in_dir(sb, p_dir, entry-i, &sector);
+		if (!ep)
+			return;
+
+		p_fs->fs_func->set_entry_type(ep, TYPE_DELETED);
+		buf_modify(sb, sector);
+	}
+} /* end of fat_delete_dir_entry */
+
+void exfat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries)
+{
+	int i;
+	u32 sector;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = order; i < num_entries; i++) {
+		ep = get_entry_in_dir(sb, p_dir, entry+i, &sector);
+		if (!ep)
+			return;
+
+		p_fs->fs_func->set_entry_type(ep, TYPE_DELETED);
+		buf_modify(sb, sector);
+	}
+} /* end of exfat_delete_dir_entry */
+
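+/* Recompute the exFAT entry-set checksum: it covers the file dentry (its own
+ * checksum field is skipped via CS_DIR_ENTRY) plus all of its secondary
+ * dentries, and the result is stored back into the file dentry. */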
+void update_dir_checksum(struct super_block *sb, CHAIN_T *p_dir, s32 entry)
+{
+	int i, num_entries;
+	u32 sector;
+	u16 chksum;
+	FILE_DENTRY_T *file_ep;
+	DENTRY_T *ep;
+
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return;
+
+	buf_lock(sb, sector);
+
+	num_entries = (s32) file_ep->num_ext + 1;
+	chksum = calc_checksum_2byte((void *) file_ep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
+
+	for (i = 1; i < num_entries; i++) {
+		ep = get_entry_in_dir(sb, p_dir, entry+i, NULL);
+		if (!ep) {
+			buf_unlock(sb, sector);
+			return;
+		}
+
+		chksum = calc_checksum_2byte((void *) ep, DENTRY_SIZE, chksum, CS_DEFAULT);
+	}
+
+	SET16_A(file_ep->checksum, chksum);
+	buf_modify(sb, sector);
+	buf_unlock(sb, sector);
+} /* end of update_dir_checksum */
+
+void update_dir_checksum_with_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es)
+{
+	DENTRY_T *ep;
+	u16 chksum = 0;
+	s32 chksum_type = CS_DIR_ENTRY, i;
+
+	ep = (DENTRY_T *)&(es->__buf);
+	for (i = 0; i < es->num_entries; i++) {
+		DPRINTK("update_dir_checksum_with_entry_set ep %p\n", ep);
+		chksum = calc_checksum_2byte((void *) ep, DENTRY_SIZE, chksum, chksum_type);
+		ep++;
+		chksum_type = CS_DEFAULT;
+	}
+
+	ep = (DENTRY_T *)&(es->__buf);
+	SET16_A(((FILE_DENTRY_T *)ep)->checksum, chksum);
+	write_whole_entry_set(sb, es);
+}
+
+static s32 _walk_fat_chain(struct super_block *sb, CHAIN_T *p_dir, s32 byte_offset, u32 *clu)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	s32 clu_offset;
+	u32 cur_clu;
+
+	clu_offset = byte_offset >> p_fs->cluster_size_bits;
+	cur_clu = p_dir->dir;
+
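+	/* flags == 0x03 means the chain is allocated contiguously (no FAT
+	 * chain), so the target cluster can be computed directly; otherwise
+	 * the FAT has to be followed link by link */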
+	if (p_dir->flags == 0x03) {
+		cur_clu += clu_offset;
+	} else {
+		while (clu_offset > 0) {
+			if (FAT_read(sb, cur_clu, &cur_clu) == -1)
+				return FFS_MEDIAERR;
+			clu_offset--;
+		}
+	}
+
+	if (clu)
+		*clu = cur_clu;
+	return FFS_SUCCESS;
+}
+
+s32 find_location(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector, s32 *offset)
+{
+	s32 off, ret;
+	u32 clu = 0;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	off = entry << DENTRY_SIZE_BITS;
+
+	if (p_dir->dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+		*offset = off & p_bd->sector_size_mask;
+		*sector = off >> p_bd->sector_size_bits;
+		*sector += p_fs->root_start_sector;
+	} else {
+		ret = _walk_fat_chain(sb, p_dir, off, &clu);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		off &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+
+		*offset = off & p_bd->sector_size_mask;	/* byte offset in sector    */
+		*sector = off >> p_bd->sector_size_bits;	/* sector offset in cluster */
+		*sector += START_SECTOR(clu);
+	}
+	return FFS_SUCCESS;
+} /* end of find_location */
+
+DENTRY_T *get_entry_with_sector(struct super_block *sb, u32 sector, s32 offset)
+{
+	u8 *buf;
+
+	buf = buf_getblk(sb, sector);
+
+	if (buf == NULL)
+		return NULL;
+
+	return (DENTRY_T *)(buf + offset);
+} /* end of get_entry_with_sector */
+
+DENTRY_T *get_entry_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector)
+{
+	s32 off;
+	u32 sec;
+	u8 *buf;
+
+	if (find_location(sb, p_dir, entry, &sec, &off) != FFS_SUCCESS)
+		return NULL;
+
+	buf = buf_getblk(sb, sec);
+
+	if (buf == NULL)
+		return NULL;
+
+	if (sector != NULL)
+		*sector = sec;
+	return (DENTRY_T *)(buf + off);
+} /* end of get_entry_in_dir */
+
+
+/* Returns a set of dentries for a file or dir.
+ * Note that this is a copy (dump) of dentries, so the caller should use
+ * write_whole_entry_set() or write_partial_entries_in_entry_set() to apply
+ * any changes made to this entry set back to the real device.
+ * in:
+ *   sb+p_dir+entry: indicates a file/dir
+ *   type: specifies how many dentries should be included.
+ * out:
+ *   file_ep: will point to the first dentry (= file dentry) on success
+ * return:
+ *   pointer to the entry set on success,
+ *   NULL on failure.
+ */
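+/*
+ * A typical call sequence for the entry-set helpers (see
+ * update_dir_checksum_with_entry_set() above and
+ * exfat_get_uni_name_from_ext_entry() below) is roughly:
+ *
+ *   es = get_entry_set_in_dir(sb, p_dir, entry, ES_ALL_ENTRIES, &ep);
+ *   if (es) {
+ *           ... read or modify the copied dentries via ep ...
+ *           write_whole_entry_set(sb, es);  <- only if entries were modified
+ *           release_entry_set(es);
+ *   }
+ */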
+
+#define ES_MODE_STARTED				0
+#define ES_MODE_GET_FILE_ENTRY			1
+#define ES_MODE_GET_STRM_ENTRY			2
+#define ES_MODE_GET_NAME_ENTRY			3
+#define ES_MODE_GET_CRITICAL_SEC_ENTRY		4
+ENTRY_SET_CACHE_T *get_entry_set_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, DENTRY_T **file_ep)
+{
+	s32 off, ret, byte_offset;
+	u32 clu = 0;
+	u32 sec, entry_type;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	ENTRY_SET_CACHE_T *es = NULL;
+	DENTRY_T *ep, *pos;
+	u8 *buf;
+	u8 num_entries;
+	s32 mode = ES_MODE_STARTED;
+
+	DPRINTK("get_entry_set_in_dir entered\n");
+	DPRINTK("p_dir dir %u flags %x size %d\n", p_dir->dir, p_dir->flags, p_dir->size);
+
+	byte_offset = entry << DENTRY_SIZE_BITS;
+	ret = _walk_fat_chain(sb, p_dir, byte_offset, &clu);
+	if (ret != FFS_SUCCESS)
+		return NULL;
+
+	byte_offset &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+
+	off = byte_offset & p_bd->sector_size_mask;	/* byte offset in sector    */
+	sec = byte_offset >> p_bd->sector_size_bits;	/* sector offset in cluster */
+	sec += START_SECTOR(clu);
+
+	buf = buf_getblk(sb, sec);
+	if (buf == NULL)
+		goto err_out;
+
+	ep = (DENTRY_T *)(buf + off);
+	entry_type = p_fs->fs_func->get_entry_type(ep);
+
+	if ((entry_type != TYPE_FILE)
+		&& (entry_type != TYPE_DIR))
+		goto err_out;
+
+	if (type == ES_ALL_ENTRIES)
+		num_entries = ((FILE_DENTRY_T *)ep)->num_ext+1;
+	else
+		num_entries = type;
+
+	DPRINTK("trying to kmalloc %zx bytes for %d entries\n", offsetof(ENTRY_SET_CACHE_T, __buf) + num_entries * sizeof(DENTRY_T), num_entries);
+	es = kmalloc(offsetof(ENTRY_SET_CACHE_T, __buf) + num_entries * sizeof(DENTRY_T), GFP_KERNEL);
+	if (es == NULL)
+		goto err_out;
+
+	es->num_entries = num_entries;
+	es->sector = sec;
+	es->offset = off;
+	es->alloc_flag = p_dir->flags;
+
+	pos = (DENTRY_T *) &(es->__buf);
+
+	while (num_entries) {
+		/* instead of copying the whole sector, we check every entry.
+		 * this provides minimum stability and consistency.
+		 */
+
+		entry_type = p_fs->fs_func->get_entry_type(ep);
+
+		if ((entry_type == TYPE_UNUSED) || (entry_type == TYPE_DELETED))
+			goto err_out;
+
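+		/* validate the expected entry-set layout: file dentry ->
+		 * stream dentry -> name dentries -> optional critical
+		 * secondary dentries */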
+		switch (mode) {
+		case ES_MODE_STARTED:
+			if  ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR))
+				mode = ES_MODE_GET_FILE_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_FILE_ENTRY:
+			if (entry_type == TYPE_STREAM)
+				mode = ES_MODE_GET_STRM_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_STRM_ENTRY:
+			if (entry_type == TYPE_EXTEND)
+				mode = ES_MODE_GET_NAME_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_NAME_ENTRY:
+			if (entry_type == TYPE_EXTEND)
+				break;
+			else if (entry_type == TYPE_STREAM)
+				goto err_out;
+			else if (entry_type & TYPE_CRITICAL_SEC)
+				mode = ES_MODE_GET_CRITICAL_SEC_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_CRITICAL_SEC_ENTRY:
+			if ((entry_type == TYPE_EXTEND) || (entry_type == TYPE_STREAM))
+				goto err_out;
+			else if ((entry_type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC)
+				goto err_out;
+			break;
+		}
+
+		memcpy(pos, ep, sizeof(DENTRY_T));
+
+		if (--num_entries == 0)
+			break;
+
+		if (((off + DENTRY_SIZE) & p_bd->sector_size_mask) < (off & p_bd->sector_size_mask)) {
+			/* get the next sector */
+			if (IS_LAST_SECTOR_IN_CLUSTER(sec)) {
+				if (es->alloc_flag == 0x03) {
+					clu++;
+				} else {
+					if (FAT_read(sb, clu, &clu) == -1)
+						goto err_out;
+				}
+				sec = START_SECTOR(clu);
+			} else {
+				sec++;
+			}
+			buf = buf_getblk(sb, sec);
+			if (buf == NULL)
+				goto err_out;
+			off = 0;
+			ep = (DENTRY_T *)(buf);
+		} else {
+			ep++;
+			off += DENTRY_SIZE;
+		}
+		pos++;
+	}
+
+	if (file_ep)
+		*file_ep = (DENTRY_T *)&(es->__buf);
+
+	DPRINTK("es sec %u offset %d flags %d, num_entries %u buf ptr %p\n",
+		   es->sector, es->offset, es->alloc_flag, es->num_entries, &(es->__buf));
+	DPRINTK("get_entry_set_in_dir exited %p\n", es);
+	return es;
+err_out:
+	DPRINTK("get_entry_set_in_dir exited NULL (es %p)\n", es);
+	if (es)
+		kfree(es);
+	return NULL;
+}
+
+void release_entry_set(ENTRY_SET_CACHE_T *es)
+{
+	DPRINTK("release_entry_set %p\n", es);
+	if (es)
+		kfree(es);
+}
+
+static s32 __write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, u32 sec, s32 off, u32 count)
+{
+	s32 num_entries, buf_off = (off - es->offset);
+	u32 remaining_byte_in_sector, copy_entries;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	u32 clu;
+	u8 *buf, *esbuf = (u8 *)&(es->__buf);
+
+	DPRINTK("__write_partial_entries_in_entry_set entered\n");
+	DPRINTK("es %p sec %u off %d count %d\n", es, sec, off, count);
+	num_entries = count;
+
+	while (num_entries) {
+		/* write on a per-sector basis */
+		remaining_byte_in_sector = (1 << p_bd->sector_size_bits) - off;
+		copy_entries = MIN(remaining_byte_in_sector >> DENTRY_SIZE_BITS, num_entries);
+		buf = buf_getblk(sb, sec);
+		if (buf == NULL)
+			goto err_out;
+		DPRINTK("es->buf %p buf_off %u\n", esbuf, buf_off);
+		DPRINTK("copying %d entries from %p to sector %u\n", copy_entries, (esbuf + buf_off), sec);
+		memcpy(buf + off, esbuf + buf_off, copy_entries << DENTRY_SIZE_BITS);
+		buf_modify(sb, sec);
+		num_entries -= copy_entries;
+
+		if (num_entries) {
+			/* get next sector */
+			if (IS_LAST_SECTOR_IN_CLUSTER(sec)) {
+				clu = GET_CLUSTER_FROM_SECTOR(sec);
+				if (es->alloc_flag == 0x03) {
+					clu++;
+				} else {
+					if (FAT_read(sb, clu, &clu) == -1)
+						goto err_out;
+				}
+				sec = START_SECTOR(clu);
+			} else {
+				sec++;
+			}
+			off = 0;
+			buf_off += copy_entries << DENTRY_SIZE_BITS;
+		}
+	}
+
+	DPRINTK("__write_partial_entries_in_entry_set exited successfully\n");
+	return FFS_SUCCESS;
+err_out:
+	DPRINTK("__write_partial_entries_in_entry_set failed\n");
+	return FFS_ERROR;
+}
+
+/* write back all entries in entry set */
+s32 write_whole_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es)
+{
+	return __write_partial_entries_in_entry_set(sb, es, es->sector, es->offset, es->num_entries);
+}
+
+/* write back some entries in entry set */
+s32 write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, DENTRY_T *ep, u32 count)
+{
+	s32 ret, byte_offset, off;
+	u32 clu = 0, sec;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	CHAIN_T dir;
+
+	/* validity check */
+	if (ep + count  > ((DENTRY_T *)&(es->__buf)) + es->num_entries)
+		return FFS_ERROR;
+
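+	/* translate ep's position inside the cached entry set back into an
+	 * on-disk sector and offset before writing the requested entries */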
+	dir.dir = GET_CLUSTER_FROM_SECTOR(es->sector);
+	dir.flags = es->alloc_flag;
+	dir.size = 0xffffffff;		/* XXX */
+
+	byte_offset = (es->sector - START_SECTOR(dir.dir)) << p_bd->sector_size_bits;
+	byte_offset += ((u8 *)ep - (u8 *)&(es->__buf)) + es->offset;	/* byte offset of ep within the entry set */
+
+	ret = _walk_fat_chain(sb, &dir, byte_offset, &clu);
+	if (ret != FFS_SUCCESS)
+		return ret;
+	byte_offset &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+	off = byte_offset & p_bd->sector_size_mask;	/* byte offset in sector    */
+	sec = byte_offset >> p_bd->sector_size_bits;	/* sector offset in cluster */
+	sec += START_SECTOR(clu);
+	return __write_partial_entries_in_entry_set(sb, es, sec, off, count);
+}
+
+/* search EMPTY CONTINUOUS "num_entries" entries */
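+/* p_fs->hint_uentry caches the first unused slot seen while scanning, so a
+ * later allocation in the same directory can resume from it instead of
+ * rescanning from the beginning. */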
+s32 search_deleted_or_unused_entry(struct super_block *sb, CHAIN_T *p_dir, s32 num_entries)
+{
+	int i, dentry, num_empty = 0;
+	s32 dentries_per_clu;
+	u32 type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	if (p_fs->hint_uentry.dir == p_dir->dir) {
+		if (p_fs->hint_uentry.entry == -1)
+			return -1;
+
+		clu.dir = p_fs->hint_uentry.clu.dir;
+		clu.size = p_fs->hint_uentry.clu.size;
+		clu.flags = p_fs->hint_uentry.clu.flags;
+
+		dentry = p_fs->hint_uentry.entry;
+	} else {
+		p_fs->hint_uentry.entry = -1;
+
+		clu.dir = p_dir->dir;
+		clu.size = p_dir->size;
+		clu.flags = p_dir->flags;
+
+		dentry = 0;
+	}
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+			i = dentry % dentries_per_clu;
+		else
+			i = dentry & (dentries_per_clu-1);
+
+		for (; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -1;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED) {
+				num_empty++;
+				if (p_fs->hint_uentry.entry == -1) {
+					p_fs->hint_uentry.dir = p_dir->dir;
+					p_fs->hint_uentry.entry = dentry;
+
+					p_fs->hint_uentry.clu.dir = clu.dir;
+					p_fs->hint_uentry.clu.size = clu.size;
+					p_fs->hint_uentry.clu.flags = clu.flags;
+				}
+			} else if (type == TYPE_DELETED) {
+				num_empty++;
+			} else {
+				num_empty = 0;
+			}
+
+			if (num_empty >= num_entries) {
+				p_fs->hint_uentry.dir = CLUSTER_32(~0);
+				p_fs->hint_uentry.entry = -1;
+
+				if (p_fs->vol_type == EXFAT)
+					return dentry - (num_entries-1);
+				else
+					return dentry;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -1;
+		}
+	}
+
+	return -1;
+} /* end of search_deleted_or_unused_entry */
+
+s32 find_empty_entry(struct inode *inode, CHAIN_T *p_dir, s32 num_entries)
+{
+	s32 ret, dentry;
+	u32 last_clu, sector;
+	u64 size = 0;
+	CHAIN_T clu;
+	DENTRY_T *ep = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		return search_deleted_or_unused_entry(sb, p_dir, num_entries);
+
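+	/* keep appending one cleared cluster to the directory until a run of
+	 * num_entries free slots is found */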
+	while ((dentry = search_deleted_or_unused_entry(sb, p_dir, num_entries)) < 0) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (p_fs->vol_type == EXFAT) {
+			if (p_dir->dir != p_fs->root_dir)
+				size = i_size_read(inode);
+		}
+
+		last_clu = find_last_cluster(sb, p_dir);
+		clu.dir = last_clu + 1;
+		clu.size = 0;
+		clu.flags = p_dir->flags;
+
+		/* (1) allocate a cluster */
+		ret = p_fs->fs_func->alloc_cluster(sb, 1, &clu);
+		if (ret < 1)
+			return -1;
+
+		if (clear_cluster(sb, clu.dir) != FFS_SUCCESS)
+			return -1;
+
+		/* (2) append to the FAT chain */
+		if (clu.flags != p_dir->flags) {
+			exfat_chain_cont_cluster(sb, p_dir->dir, p_dir->size);
+			p_dir->flags = 0x01;
+			p_fs->hint_uentry.clu.flags = 0x01;
+		}
+		if (clu.flags == 0x01)
+			if (FAT_write(sb, last_clu, clu.dir) < 0)
+				return -1;
+
+		if (p_fs->hint_uentry.entry == -1) {
+			p_fs->hint_uentry.dir = p_dir->dir;
+			p_fs->hint_uentry.entry = p_dir->size << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+			p_fs->hint_uentry.clu.dir = clu.dir;
+			p_fs->hint_uentry.clu.size = 0;
+			p_fs->hint_uentry.clu.flags = clu.flags;
+		}
+		p_fs->hint_uentry.clu.size++;
+		p_dir->size++;
+
+		/* (3) update the directory entry */
+		if (p_fs->vol_type == EXFAT) {
+			if (p_dir->dir != p_fs->root_dir) {
+				size += p_fs->cluster_size;
+
+				ep = get_entry_in_dir(sb, &(fid->dir), fid->entry+1, &sector);
+				if (!ep)
+					return -1;
+				p_fs->fs_func->set_entry_size(ep, size);
+				p_fs->fs_func->set_entry_flag(ep, p_dir->flags);
+				buf_modify(sb, sector);
+
+				update_dir_checksum(sb, &(fid->dir), fid->entry);
+			}
+		}
+
+		i_size_write(inode, i_size_read(inode)+p_fs->cluster_size);
+		EXFAT_I(inode)->mmu_private += p_fs->cluster_size;
+		EXFAT_I(inode)->fid.size += p_fs->cluster_size;
+		EXFAT_I(inode)->fid.flags = p_dir->flags;
+		inode->i_blocks += 1 << (p_fs->cluster_size_bits - 9);
+	}
+
+	return dentry;
+} /* end of find_empty_entry */
+
+/* return values of fat_find_dir_entry()
+   >= 0 : the directory entry position of the name in dir
+   -1 : (root dir, ".") it is the root dir itself
+   -2 : no entry with the name exists */
+s32 fat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type)
+{
+	int i, dentry = 0, lossy = FALSE, len;
+	s32 order = 0, is_feasible_entry = TRUE, has_ext_entry = FALSE;
+	s32 dentries_per_clu;
+	u32 entry_type;
+	u16 entry_uniname[14], *uniname = NULL, unichar;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	DOS_DENTRY_T *dos_ep;
+	EXT_DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == p_fs->root_dir) {
+		if ((!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_CUR_DIR_NAME)) ||
+			(!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_PAR_DIR_NAME)))
+			return -1; /* special case, the root directory itself */
+	}
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -2;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+
+			if ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR)) {
+				if ((type == TYPE_ALL) || (type == entry_type)) {
+					if (is_feasible_entry && has_ext_entry)
+						return dentry;
+
+					dos_ep = (DOS_DENTRY_T *) ep;
+					if ((!lossy) && (!nls_dosname_cmp(sb, p_dosname->name, dos_ep->name)))
+						return dentry;
+				}
+				is_feasible_entry = TRUE;
+				has_ext_entry = FALSE;
+			} else if (entry_type == TYPE_EXTEND) {
+				if (is_feasible_entry) {
+					ext_ep = (EXT_DENTRY_T *) ep;
+					if (ext_ep->order > 0x40) {
+						order = (s32)(ext_ep->order - 0x40);
+						uniname = p_uniname->name + 13 * (order-1);
+					} else {
+						order = (s32) ext_ep->order;
+						uniname -= 13;
+					}
+
+					len = extract_uni_name_from_ext_entry(ext_ep, entry_uniname, order);
+
+					unichar = *(uniname+len);
+					*(uniname+len) = 0x0;
+
+					if (nls_uniname_cmp(sb, uniname, entry_uniname))
+						is_feasible_entry = FALSE;
+
+					*(uniname+len) = unichar;
+				}
+				has_ext_entry = TRUE;
+			} else if (entry_type == TYPE_UNUSED) {
+				return -2;
+			} else {
+				is_feasible_entry = TRUE;
+				has_ext_entry = FALSE;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return -2;
+	}
+
+	return -2;
+} /* end of fat_find_dir_entry */
+
+/* return values of exfat_find_dir_entry()
+   >= 0 : the directory entry position of the name in dir
+   -1 : (root dir, ".") it is the root dir itself
+   -2 : no entry with the name exists */
+s32 exfat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type)
+{
+	int i, dentry = 0, num_ext_entries = 0, len;
+	s32 order = 0, is_feasible_entry = FALSE;
+	s32 dentries_per_clu, num_empty = 0;
+	u32 entry_type;
+	u16 entry_uniname[16], *uniname = NULL, unichar;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+	NAME_DENTRY_T *name_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == p_fs->root_dir) {
+		if ((!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_CUR_DIR_NAME)) ||
+			(!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_PAR_DIR_NAME)))
+			return -1; /* special case, the root directory itself */
+	}
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	p_fs->hint_uentry.dir = p_dir->dir;
+	p_fs->hint_uentry.entry = -1;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -2;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+
+			if ((entry_type == TYPE_UNUSED) || (entry_type == TYPE_DELETED)) {
+				is_feasible_entry = FALSE;
+
+				if (p_fs->hint_uentry.entry == -1) {
+					num_empty++;
+
+					if (num_empty == 1) {
+						p_fs->hint_uentry.clu.dir = clu.dir;
+						p_fs->hint_uentry.clu.size = clu.size;
+						p_fs->hint_uentry.clu.flags = clu.flags;
+					}
+					if ((num_empty >= num_entries) || (entry_type == TYPE_UNUSED))
+						p_fs->hint_uentry.entry = dentry - (num_empty-1);
+				}
+
+				if (entry_type == TYPE_UNUSED)
+					return -2;
+			} else {
+				num_empty = 0;
+
+				if ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR)) {
+					if ((type == TYPE_ALL) || (type == entry_type)) {
+						file_ep = (FILE_DENTRY_T *) ep;
+						num_ext_entries = file_ep->num_ext;
+						is_feasible_entry = TRUE;
+					} else {
+						is_feasible_entry = FALSE;
+					}
+				} else if (entry_type == TYPE_STREAM) {
+					if (is_feasible_entry) {
+						strm_ep = (STRM_DENTRY_T *) ep;
+						if (p_uniname->name_len == strm_ep->name_len) {
+							order = 1;
+						} else {
+							is_feasible_entry = FALSE;
+						}
+					}
+				} else if (entry_type == TYPE_EXTEND) {
+					if (is_feasible_entry) {
+						name_ep = (NAME_DENTRY_T *) ep;
+
+						if ((++order) == 2)
+							uniname = p_uniname->name;
+						else
+							uniname += 15;
+
+						len = extract_uni_name_from_name_entry(name_ep, entry_uniname, order);
+
+						unichar = *(uniname+len);
+						*(uniname+len) = 0x0;
+
+						if (nls_uniname_cmp(sb, uniname, entry_uniname)) {
+							is_feasible_entry = FALSE;
+						} else if (order == num_ext_entries) {
+							p_fs->hint_uentry.dir = CLUSTER_32(~0);
+							p_fs->hint_uentry.entry = -1;
+							return dentry - (num_ext_entries);
+						}
+
+						*(uniname+len) = unichar;
+					}
+				} else {
+					is_feasible_entry = FALSE;
+				}
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -2;
+		}
+	}
+
+	return -2;
+} /* end of exfat_find_dir_entry */
+
+/* returns -1 on error */
+s32 fat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry)
+{
+	s32 count = 0;
+	u8 chksum;
+	DOS_DENTRY_T *dos_ep = (DOS_DENTRY_T *) p_entry;
+	EXT_DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	chksum = calc_checksum_1byte((void *) dos_ep->name, DOS_NAME_LENGTH, 0);
+
+	for (entry--; entry >= 0; entry--) {
+		ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ext_ep)
+			return -1;
+
+		if ((p_fs->fs_func->get_entry_type((DENTRY_T *) ext_ep) == TYPE_EXTEND) &&
+			(ext_ep->checksum == chksum)) {
+			count++;
+			if (ext_ep->order > 0x40)
+				return count;
+		} else {
+			return count;
+		}
+	}
+
+	return count;
+} /* end of fat_count_ext_entries */
+
+/* returns -1 on error */
+s32 exfat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry)
+{
+	int i, count = 0;
+	u32 type;
+	FILE_DENTRY_T *file_ep = (FILE_DENTRY_T *) p_entry;
+	DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = 0, entry++; i < file_ep->num_ext; i++, entry++) {
+		ext_ep = get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ext_ep)
+			return -1;
+
+		type = p_fs->fs_func->get_entry_type(ext_ep);
+		if ((type == TYPE_EXTEND) || (type == TYPE_STREAM))
+			count++;
+		else
+			return count;
+	}
+
+	return count;
+} /* end of exfat_count_ext_entries */
+
+/* returns -1 on error */
+s32 count_dos_name_entries(struct super_block *sb, CHAIN_T *p_dir, u32 type)
+{
+	int i, count = 0;
+	s32 dentries_per_clu;
+	u32 entry_type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -1;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+
+			if (entry_type == TYPE_UNUSED)
+				return count;
+			if (!(type & TYPE_CRITICAL_PRI) && !(type & TYPE_BENIGN_PRI))
+				continue;
+
+			if ((type == TYPE_ALL) || (type == entry_type))
+				count++;
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -1;
+		}
+	}
+
+	return count;
+} /* end of count_dos_name_entries */
+
+bool is_dir_empty(struct super_block *sb, CHAIN_T *p_dir)
+{
+	int i, count = 0;
+	s32 dentries_per_clu;
+	u32 type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				break;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED)
+				return TRUE;
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			if (p_dir->dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+				return FALSE;
+			} else {
+				if (p_fs->vol_type == EXFAT)
+					return FALSE;
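+				/* a FAT subdirectory always contains "." and
+				 * "..", so any entry beyond those two means it
+				 * is not empty */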
+				if ((p_dir->dir == p_fs->root_dir) || ((++count) > 2))
+					return FALSE;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				break;
+		}
+	}
+
+	return TRUE;
+} /* end of is_dir_empty */
+
+/*
+ *  Name Conversion Functions
+ */
+
+/* input  : dir, uni_name
+   output : num_of_entry, dos_name (format: aaaaaa~1.bbb) */
+s32 get_num_entries_and_dos_name(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 *entries, DOS_NAME_T *p_dosname)
+{
+	s32 ret, num_entries, lossy = FALSE;
+	char **r;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	num_entries = p_fs->fs_func->calc_num_entries(p_uniname);
+	if (num_entries == 0)
+		return FFS_INVALIDPATH;
+
+	if (p_fs->vol_type != EXFAT) {
+		nls_uniname_to_dosname(sb, p_dosname, p_uniname, &lossy);
+
+		if (lossy) {
+			ret = fat_generate_dos_name(sb, p_dir, p_dosname);
+			if (ret)
+				return ret;
+		} else {
+			for (r = reserved_names; *r; r++) {
+				if (!strncmp((void *) p_dosname->name, *r, 8))
+					return FFS_INVALIDPATH;
+			}
+
+			if (p_dosname->name_case != 0xFF)
+				num_entries = 1;
+		}
+
+		if (num_entries > 1)
+			p_dosname->name_case = 0x0;
+	}
+
+	*entries = num_entries;
+
+	return FFS_SUCCESS;
+} /* end of get_num_entries_and_dos_name */
+
+void get_uni_name_from_dos_entry(struct super_block *sb, DOS_DENTRY_T *ep, UNI_NAME_T *p_uniname, u8 mode)
+{
+	DOS_NAME_T dos_name;
+
+	if (mode == 0x0)
+		dos_name.name_case = 0x0;
+	else
+		dos_name.name_case = ep->lcase;
+
+	memcpy(dos_name.name, ep->name, DOS_NAME_LENGTH);
+	nls_dosname_to_uniname(sb, p_uniname, &dos_name);
+} /* end of get_uni_name_from_dos_entry */
+
+void fat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname)
+{
+	int i;
+	EXT_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (entry--, i = 1; entry >= 0; entry--, i++) {
+		ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ep)
+			return;
+
+		if (p_fs->fs_func->get_entry_type((DENTRY_T *) ep) == TYPE_EXTEND) {
+			extract_uni_name_from_ext_entry(ep, uniname, i);
+			if (ep->order > 0x40)
+				return;
+		} else {
+			return;
+		}
+
+		uniname += 13;
+	}
+} /* end of fat_get_uni_name_from_ext_entry */
+
+void exfat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname)
+{
+	int i;
+	DENTRY_T *ep;
+	ENTRY_SET_CACHE_T *es;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	es = get_entry_set_in_dir(sb, p_dir, entry, ES_ALL_ENTRIES, &ep);
+	if (es == NULL || es->num_entries < 3) {
+		if (es)
+			release_entry_set(es);
+		return;
+	}
+
+	ep += 2;
+
+	/*
+	 * First entry  : file entry
+	 * Second entry : stream-extension entry
+	 * Third entry  : first file-name entry
+	 * So, the index of the first file-name dentry starts from 2.
+	 */
+	for (i = 2; i < es->num_entries; i++, ep++) {
+		if (p_fs->fs_func->get_entry_type(ep) == TYPE_EXTEND)
+			extract_uni_name_from_name_entry((NAME_DENTRY_T *)ep, uniname, i);
+		else
+			goto out;
+		uniname += 15;
+	}
+
+out:
+	release_entry_set(es);
+} /* end of exfat_get_uni_name_from_ext_entry */
+
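+/* A FAT long-name (LFN) dentry stores up to 13 UTF-16 characters split
+ * across three fields: unicode_0_4 (5), unicode_5_10 (6) and
+ * unicode_11_12 (2). */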
+s32 extract_uni_name_from_ext_entry(EXT_DENTRY_T *ep, u16 *uniname, s32 order)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 10; i += 2) {
+		*uniname = GET16(ep->unicode_0_4+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	if (order < 20) {
+		for (i = 0; i < 12; i += 2) {
+			*uniname = GET16_A(ep->unicode_5_10+i);
+			if (*uniname == 0x0)
+				return len;
+			uniname++;
+			len++;
+		}
+	} else {
+		for (i = 0; i < 8; i += 2) {
+			*uniname = GET16_A(ep->unicode_5_10+i);
+			if (*uniname == 0x0)
+				return len;
+			uniname++;
+			len++;
+		}
+		*uniname = 0x0; /* uniname[MAX_NAME_LENGTH-1] */
+		return len;
+	}
+
+	for (i = 0; i < 4; i += 2) {
+		*uniname = GET16_A(ep->unicode_11_12+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	*uniname = 0x0;
+	return len;
+
+} /* end of extract_uni_name_from_ext_entry */
+
+s32 extract_uni_name_from_name_entry(NAME_DENTRY_T *ep, u16 *uniname, s32 order)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 30; i += 2) {
+		*uniname = GET16_A(ep->unicode_0_14+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	*uniname = 0x0;
+	return len;
+
+} /* end of extract_uni_name_from_name_entry */
+
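+/* Pick a free "~N" numeric tail for a lossy 8.3 name: scan the directory,
+ * mark every tail number already in use (1..1023) in a bitmap, then choose
+ * the lowest unused one. */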
+s32 fat_generate_dos_name(struct super_block *sb, CHAIN_T *p_dir, DOS_NAME_T *p_dosname)
+{
+	int i, j, count = 0, count_begin = FALSE;
+	s32 dentries_per_clu;
+	u32 type;
+	u8 bmap[128/* 1 ~ 1023 */];
+	CHAIN_T clu;
+	DOS_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	memset(bmap, 0, sizeof bmap);
+	exfat_bitmap_set(bmap, 0);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			count = 0;
+			count_begin = FALSE;
+
+			for (j = 0; j < 8; j++) {
+				if (ep->name[j] == ' ')
+					break;
+
+				if (ep->name[j] == '~') {
+					count_begin = TRUE;
+				} else if (count_begin) {
+					if ((ep->name[j] >= '0') && (ep->name[j] <= '9')) {
+						count = count * 10 + (ep->name[j] - '0');
+					} else {
+						count = 0;
+						count_begin = FALSE;
+					}
+				}
+			}
+
+			if ((count > 0) && (count < 1024))
+				exfat_bitmap_set(bmap, count);
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+
+	count = 0;
+	for (i = 0; i < 128; i++) {
+		if (bmap[i] != 0xFF) {
+			for (j = 0; j < 8; j++) {
+				if (exfat_bitmap_test(&(bmap[i]), j) == 0) {
+					count = (i << 3) + j;
+					break;
+				}
+			}
+			if (count != 0)
+				break;
+		}
+	}
+
+	if ((count == 0) || (count >= 1024))
+		return FFS_FILEEXIST;
+	else
+		fat_attach_count_to_dos_name(p_dosname->name, count);
+
+	/* Now dos_name has DOS~????.EXT */
+	return FFS_SUCCESS;
+} /* end of fat_generate_dos_name */
+
+void fat_attach_count_to_dos_name(u8 *dosname, s32 count)
+{
+	int i, j, length;
+	char str_count[6];
+
+	snprintf(str_count, sizeof str_count, "~%d", count);
+	length = strlen(str_count);
+
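+	/* find where the "~N" tail should go in the 8-character base name
+	 * (a byte with the high bit set is treated as the lead byte of a
+	 * double-byte character and skipped as a pair) */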
+	i = j = 0;
+	while (j <= (8 - length)) {
+		i = j;
+		if (dosname[j] == ' ')
+			break;
+		if (dosname[j] & 0x80)
+			j += 2;
+		else
+			j++;
+	}
+
+	for (j = 0; j < length; i++, j++)
+		dosname[i] = (u8) str_count[j];
+
+	if (i == 7)
+		dosname[7] = ' ';
+
+} /* end of fat_attach_count_to_dos_name */
+
+s32 fat_calc_num_entries(UNI_NAME_T *p_uniname)
+{
+	s32 len;
+
+	len = p_uniname->name_len;
+	if (len == 0)
+		return 0;
+
+	/* 1 dos name entry + extended entries */
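+	/* e.g. a 26-character name: (26-1)/13 + 2 = 3 dentries
+	 * (two 13-character extended entries plus the DOS 8.3 entry) */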
+	return (len-1) / 13 + 2;
+
+} /* end of fat_calc_num_entries */
+
+s32 exfat_calc_num_entries(UNI_NAME_T *p_uniname)
+{
+	s32 len;
+
+	len = p_uniname->name_len;
+	if (len == 0)
+		return 0;
+
+	/* 1 file entry + 1 stream entry + name entries */
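+	/* e.g. a 30-character name: (30-1)/15 + 3 = 4 dentries
+	 * (two 15-character name entries plus the file and stream entries) */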
+	return (len-1) / 15 + 3;
+
+} /* end of exfat_calc_num_entries */
+
+u8 calc_checksum_1byte(void *data, s32 len, u8 chksum)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
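+	/* rotate the running checksum right by one bit, then add the next byte */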
+	for (i = 0; i < len; i++, c++)
+		chksum = (((chksum & 1) << 7) | ((chksum & 0xFE) >> 1)) + *c;
+
+	return chksum;
+} /* end of calc_checksum_1byte */
+
+u16 calc_checksum_2byte(void *data, s32 len, u16 chksum, s32 type)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
+	switch (type) {
+	case CS_DIR_ENTRY:
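+		/* bytes 2 and 3 of the file dentry hold the checksum field
+		 * itself and are excluded from the calculation */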
+		for (i = 0; i < len; i++, c++) {
+			if ((i == 2) || (i == 3))
+				continue;
+			chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + (u16) *c;
+		}
+		break;
+	default:
+		for (i = 0; i < len; i++, c++)
+			chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + (u16) *c;
+	}
+
+	return chksum;
+} /* end of calc_checksum_2byte */
+
+u32 calc_checksum_4byte(void *data, s32 len, u32 chksum, s32 type)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
+	switch (type) {
+	case CS_PBR_SECTOR:
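+		/* bytes 106-107 (volume flags) and 112 (percent-in-use) of the
+		 * boot sector are volatile and excluded from the boot checksum */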
+		for (i = 0; i < len; i++, c++) {
+			if ((i == 106) || (i == 107) || (i == 112))
+				continue;
+			chksum = (((chksum & 1) << 31) | ((chksum & 0xFFFFFFFE) >> 1)) + (u32) *c;
+		}
+		break;
+	default:
+		for (i = 0; i < len; i++, c++)
+			chksum = (((chksum & 1) << 31) | ((chksum & 0xFFFFFFFE) >> 1)) + (u32) *c;
+	}
+
+	return chksum;
+} /* end of calc_checksum_4byte */
+
+/*
+ *  Name Resolution Functions
+ */
+
+/* return values of resolve_path()
+   > 0 : return the length of the path
+   < 0 : return error */
+s32 resolve_path(struct inode *inode, char *path, CHAIN_T *p_dir, UNI_NAME_T *p_uniname)
+{
+	s32 lossy = FALSE;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	if (strlen(path) >= (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
+		return FFS_INVALIDPATH;
+
+	strcpy(name_buf, path);
+
+	nls_cstring_to_uniname(sb, p_uniname, name_buf, &lossy);
+	if (lossy)
+		return FFS_INVALIDPATH;
+
+	fid->size = i_size_read(inode);
+
+	p_dir->dir = fid->start_clu;
+	p_dir->size = (s32)(fid->size >> p_fs->cluster_size_bits);
+	p_dir->flags = fid->flags;
+
+	return FFS_SUCCESS;
+}
+
+/*
+ *  File Operation Functions
+ */
+static FS_FUNC_T fat_fs_func = {
+	.alloc_cluster = fat_alloc_cluster,
+	.free_cluster = fat_free_cluster,
+	.count_used_clusters = fat_count_used_clusters,
+
+	.init_dir_entry = fat_init_dir_entry,
+	.init_ext_entry = fat_init_ext_entry,
+	.find_dir_entry = fat_find_dir_entry,
+	.delete_dir_entry = fat_delete_dir_entry,
+	.get_uni_name_from_ext_entry = fat_get_uni_name_from_ext_entry,
+	.count_ext_entries = fat_count_ext_entries,
+	.calc_num_entries = fat_calc_num_entries,
+
+	.get_entry_type = fat_get_entry_type,
+	.set_entry_type = fat_set_entry_type,
+	.get_entry_attr = fat_get_entry_attr,
+	.set_entry_attr = fat_set_entry_attr,
+	.get_entry_flag = fat_get_entry_flag,
+	.set_entry_flag = fat_set_entry_flag,
+	.get_entry_clu0 = fat_get_entry_clu0,
+	.set_entry_clu0 = fat_set_entry_clu0,
+	.get_entry_size = fat_get_entry_size,
+	.set_entry_size = fat_set_entry_size,
+	.get_entry_time = fat_get_entry_time,
+	.set_entry_time = fat_set_entry_time,
+};
+
+s32 fat16_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	s32 num_reserved, num_root_sectors;
+	BPB16_T *p_bpb = (BPB16_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	num_root_sectors = GET16(p_bpb->num_root_entries) << DENTRY_SIZE_BITS;
+	num_root_sectors = ((num_root_sectors-1) >> p_bd->sector_size_bits) + 1;
+
+	p_fs->sectors_per_clu = p_bpb->sectors_per_clu;
+	p_fs->sectors_per_clu_bits = ilog2(p_bpb->sectors_per_clu);
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET16(p_bpb->num_fat_sectors);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET16(p_bpb->num_reserved);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->FAT2_start_sector + p_fs->num_FAT_sectors;
+	p_fs->data_start_sector = p_fs->root_start_sector + num_root_sectors;
+
+	p_fs->num_sectors = GET16(p_bpb->num_sectors);
+	if (p_fs->num_sectors == 0)
+		p_fs->num_sectors = GET32(p_bpb->num_huge_sectors);
+
+	num_reserved = p_fs->data_start_sector - p_fs->PBR_sector;
+	p_fs->num_clusters = ((p_fs->num_sectors - num_reserved) >> p_fs->sectors_per_clu_bits) + 2;
+	/* because the cluster index starts with 2 */
+
+	if (p_fs->num_clusters < FAT12_THRESHOLD)
+		p_fs->vol_type = FAT12;
+	else
+		p_fs->vol_type = FAT16;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = 0;
+	p_fs->dentries_in_root = GET16(p_bpb->num_root_entries);
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = VOL_CLEAN;
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &fat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of fat16_mount */
+
+s32 fat32_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	s32 num_reserved;
+	BPB32_T *p_bpb = (BPB32_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	p_fs->sectors_per_clu = p_bpb->sectors_per_clu;
+	p_fs->sectors_per_clu_bits = ilog2(p_bpb->sectors_per_clu);
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET32(p_bpb->num_fat32_sectors);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET16(p_bpb->num_reserved);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->FAT2_start_sector + p_fs->num_FAT_sectors;
+	p_fs->data_start_sector = p_fs->root_start_sector;
+
+	p_fs->num_sectors = GET32(p_bpb->num_huge_sectors);
+	num_reserved = p_fs->data_start_sector - p_fs->PBR_sector;
+
+	p_fs->num_clusters = ((p_fs->num_sectors-num_reserved) >> p_fs->sectors_per_clu_bits) + 2;
+	/* because the cluster index starts with 2 */
+
+	p_fs->vol_type = FAT32;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = GET32(p_bpb->root_cluster);
+	p_fs->dentries_in_root = 0;
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = VOL_CLEAN;
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &fat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of fat32_mount */
+
+static FS_FUNC_T exfat_fs_func = {
+	.alloc_cluster = exfat_alloc_cluster,
+	.free_cluster = exfat_free_cluster,
+	.count_used_clusters = exfat_count_used_clusters,
+
+	.init_dir_entry = exfat_init_dir_entry,
+	.init_ext_entry = exfat_init_ext_entry,
+	.find_dir_entry = exfat_find_dir_entry,
+	.delete_dir_entry = exfat_delete_dir_entry,
+	.get_uni_name_from_ext_entry = exfat_get_uni_name_from_ext_entry,
+	.count_ext_entries = exfat_count_ext_entries,
+	.calc_num_entries = exfat_calc_num_entries,
+
+	.get_entry_type = exfat_get_entry_type,
+	.set_entry_type = exfat_set_entry_type,
+	.get_entry_attr = exfat_get_entry_attr,
+	.set_entry_attr = exfat_set_entry_attr,
+	.get_entry_flag = exfat_get_entry_flag,
+	.set_entry_flag = exfat_set_entry_flag,
+	.get_entry_clu0 = exfat_get_entry_clu0,
+	.set_entry_clu0 = exfat_set_entry_clu0,
+	.get_entry_size = exfat_get_entry_size,
+	.set_entry_size = exfat_set_entry_size,
+	.get_entry_time = exfat_get_entry_time,
+	.set_entry_time = exfat_set_entry_time,
+};
+
+s32 exfat_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	BPBEX_T *p_bpb = (BPBEX_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	p_fs->sectors_per_clu = 1 << p_bpb->sectors_per_clu_bits;
+	p_fs->sectors_per_clu_bits = p_bpb->sectors_per_clu_bits;
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET32(p_bpb->fat_length);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET32(p_bpb->fat_offset);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->PBR_sector + GET32(p_bpb->clu_offset);
+	p_fs->data_start_sector = p_fs->root_start_sector;
+
+	p_fs->num_sectors = GET64(p_bpb->vol_length);
+	p_fs->num_clusters = GET32(p_bpb->clu_count) + 2;
+	/* because the cluster index starts with 2 */
+
+	p_fs->vol_type = EXFAT;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = GET32(p_bpb->root_cluster);
+	p_fs->dentries_in_root = 0;
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = (u32) GET16(p_bpb->vol_flags);
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &exfat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of exfat_mount */
+
+s32 create_dir(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	u64 size;
+	CHAIN_T clu;
+	DOS_NAME_T dos_name, dot_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* find_empty_entry must be called before alloc_cluster */
+	dentry = find_empty_entry(inode, p_dir, num_entries);
+	if (dentry < 0)
+		return FFS_FULL;
+
+	clu.dir = CLUSTER_32(~0);
+	clu.size = 0;
+	clu.flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+	/* (1) allocate a cluster */
+	ret = p_fs->fs_func->alloc_cluster(sb, 1, &clu);
+	if (ret < 0)
+		return FFS_MEDIAERR;
+	else if (ret == 0)
+		return FFS_FULL;
+
+	ret = clear_cluster(sb, clu.dir);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	if (p_fs->vol_type == EXFAT) {
+		size = p_fs->cluster_size;
+	} else {
+		size = 0;
+
+		/* initialize the "." and ".." entries:
+		   "." points to this directory itself,
+		   ".." points to the parent directory */
+
+		dot_name.name_case = 0x0;
+		memcpy(dot_name.name, DOS_CUR_DIR_NAME, DOS_NAME_LENGTH);
+
+		ret = p_fs->fs_func->init_dir_entry(sb, &clu, 0, TYPE_DIR, clu.dir, 0);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		ret = p_fs->fs_func->init_ext_entry(sb, &clu, 0, 1, NULL, &dot_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		memcpy(dot_name.name, DOS_PAR_DIR_NAME, DOS_NAME_LENGTH);
+
+		if (p_dir->dir == p_fs->root_dir)
+			ret = p_fs->fs_func->init_dir_entry(sb, &clu, 1, TYPE_DIR, CLUSTER_32(0), 0);
+		else
+			ret = p_fs->fs_func->init_dir_entry(sb, &clu, 1, TYPE_DIR, p_dir->dir, 0);
+
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		ret = p_fs->fs_func->init_ext_entry(sb, &clu, 1, 1, NULL, &dot_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+	}
+
+	/* (2) update the directory entry */
+	/* make sub-dir entry in parent directory */
+	ret = p_fs->fs_func->init_dir_entry(sb, p_dir, dentry, TYPE_DIR, clu.dir, size);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_dir, dentry, num_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	fid->dir.dir = p_dir->dir;
+	fid->dir.size = p_dir->size;
+	fid->dir.flags = p_dir->flags;
+	fid->entry = dentry;
+
+	fid->attr = ATTR_SUBDIR;
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+	fid->size = size;
+	fid->start_clu = clu.dir;
+
+	fid->type = TYPE_DIR;
+	fid->rwoffset = 0;
+	fid->hint_last_off = -1;
+
+	return FFS_SUCCESS;
+} /* end of create_dir */
+
+s32 create_file(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, u8 mode, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	DOS_NAME_T dos_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* find_empty_entry must be called before alloc_cluster() */
+	dentry = find_empty_entry(inode, p_dir, num_entries);
+	if (dentry < 0)
+		return FFS_FULL;
+
+	/* (1) update the directory entry */
+	/* fill the dos name directory entry information of the created file.
+	   the first cluster is not determined yet. (0) */
+	ret = p_fs->fs_func->init_dir_entry(sb, p_dir, dentry, TYPE_FILE | mode, CLUSTER_32(0), 0);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_dir, dentry, num_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	fid->dir.dir = p_dir->dir;
+	fid->dir.size = p_dir->size;
+	fid->dir.flags = p_dir->flags;
+	fid->entry = dentry;
+
+	fid->attr = ATTR_ARCHIVE | mode;
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+
+	fid->type = TYPE_FILE;
+	fid->rwoffset = 0;
+	fid->hint_last_off = -1;
+
+	return FFS_SUCCESS;
+} /* end of create_file */
+
+void remove_file(struct inode *inode, CHAIN_T *p_dir, s32 entry)
+{
+	s32 num_entries;
+	u32 sector;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ep = get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!ep)
+		return;
+
+	buf_lock(sb, sector);
+
+	/* buf_lock() must be called before count_ext_entries() */
+	num_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, entry, ep);
+	if (num_entries < 0) {
+		buf_unlock(sb, sector);
+		return;
+	}
+	num_entries++;
+
+	buf_unlock(sb, sector);
+
+	/* (1) update the directory entry */
+	p_fs->fs_func->delete_dir_entry(sb, p_dir, entry, 0, num_entries);
+} /* end of remove_file */
+
+s32 rename_file(struct inode *inode, CHAIN_T *p_dir, s32 oldentry, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, newentry = -1, num_old_entries, num_new_entries;
+	u32 sector_old, sector_new;
+	DOS_NAME_T dos_name;
+	DENTRY_T *epold, *epnew;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	epold = get_entry_in_dir(sb, p_dir, oldentry, &sector_old);
+	if (!epold)
+		return FFS_MEDIAERR;
+
+	buf_lock(sb, sector_old);
+
+	/* buf_lock() must be called before count_ext_entries() */
+	num_old_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, oldentry, epold);
+	if (num_old_entries < 0) {
+		buf_unlock(sb, sector_old);
+		return FFS_MEDIAERR;
+	}
+	num_old_entries++;
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_new_entries, &dos_name);
+	if (ret) {
+		buf_unlock(sb, sector_old);
+		return ret;
+	}
+
+	if (num_old_entries < num_new_entries) {
+		newentry = find_empty_entry(inode, p_dir, num_new_entries);
+		if (newentry < 0) {
+			buf_unlock(sb, sector_old);
+			return FFS_FULL;
+		}
+
+		epnew = get_entry_in_dir(sb, p_dir, newentry, &sector_new);
+		if (!epnew) {
+			buf_unlock(sb, sector_old);
+			return FFS_MEDIAERR;
+		}
+
+		memcpy((void *) epnew, (void *) epold, DENTRY_SIZE);
+		if (p_fs->fs_func->get_entry_type(epnew) == TYPE_FILE) {
+			p_fs->fs_func->set_entry_attr(epnew, p_fs->fs_func->get_entry_attr(epnew) | ATTR_ARCHIVE);
+			fid->attr |= ATTR_ARCHIVE;
+		}
+		buf_modify(sb, sector_new);
+		buf_unlock(sb, sector_old);
+
+		if (p_fs->vol_type == EXFAT) {
+			epold = get_entry_in_dir(sb, p_dir, oldentry+1, &sector_old);
+			buf_lock(sb, sector_old);
+			epnew = get_entry_in_dir(sb, p_dir, newentry+1, &sector_new);
+
+			if (!epold || !epnew) {
+				buf_unlock(sb, sector_old);
+				return FFS_MEDIAERR;
+			}
+
+			memcpy((void *) epnew, (void *) epold, DENTRY_SIZE);
+			buf_modify(sb, sector_new);
+			buf_unlock(sb, sector_old);
+		}
+
+		ret = p_fs->fs_func->init_ext_entry(sb, p_dir, newentry, num_new_entries, p_uniname, &dos_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, oldentry, 0, num_old_entries);
+		fid->entry = newentry;
+	} else {
+		if (p_fs->fs_func->get_entry_type(epold) == TYPE_FILE) {
+			p_fs->fs_func->set_entry_attr(epold, p_fs->fs_func->get_entry_attr(epold) | ATTR_ARCHIVE);
+			fid->attr |= ATTR_ARCHIVE;
+		}
+		buf_modify(sb, sector_old);
+		buf_unlock(sb, sector_old);
+
+		ret = p_fs->fs_func->init_ext_entry(sb, p_dir, oldentry, num_new_entries, p_uniname, &dos_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, oldentry, num_new_entries, num_old_entries);
+	}
+
+	return FFS_SUCCESS;
+} /* end of rename_file */
+
+s32 move_file(struct inode *inode, CHAIN_T *p_olddir, s32 oldentry, CHAIN_T *p_newdir, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, newentry, num_new_entries, num_old_entries;
+	u32 sector_mov, sector_new;
+	CHAIN_T clu;
+	DOS_NAME_T dos_name;
+	DENTRY_T *epmov, *epnew;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	epmov = get_entry_in_dir(sb, p_olddir, oldentry, &sector_mov);
+	if (!epmov)
+		return FFS_MEDIAERR;
+
+	/* check if the source and target directories are the same */
+	if (p_fs->fs_func->get_entry_type(epmov) == TYPE_DIR &&
+		p_fs->fs_func->get_entry_clu0(epmov) == p_newdir->dir)
+		return FFS_INVALIDPATH;
+
+	buf_lock(sb, sector_mov);
+
+	/* buf_lock() must be called before count_ext_entries() */
+	num_old_entries = p_fs->fs_func->count_ext_entries(sb, p_olddir, oldentry, epmov);
+	if (num_old_entries < 0) {
+		buf_unlock(sb, sector_mov);
+		return FFS_MEDIAERR;
+	}
+	num_old_entries++;
+
+	ret = get_num_entries_and_dos_name(sb, p_newdir, p_uniname, &num_new_entries, &dos_name);
+	if (ret) {
+		buf_unlock(sb, sector_mov);
+		return ret;
+	}
+
+	newentry = find_empty_entry(inode, p_newdir, num_new_entries);
+	if (newentry < 0) {
+		buf_unlock(sb, sector_mov);
+		return FFS_FULL;
+	}
+
+	epnew = get_entry_in_dir(sb, p_newdir, newentry, &sector_new);
+	if (!epnew) {
+		buf_unlock(sb, sector_mov);
+		return FFS_MEDIAERR;
+	}
+
+	memcpy((void *) epnew, (void *) epmov, DENTRY_SIZE);
+	if (p_fs->fs_func->get_entry_type(epnew) == TYPE_FILE) {
+		p_fs->fs_func->set_entry_attr(epnew, p_fs->fs_func->get_entry_attr(epnew) | ATTR_ARCHIVE);
+		fid->attr |= ATTR_ARCHIVE;
+	}
+	buf_modify(sb, sector_new);
+	buf_unlock(sb, sector_mov);
+
+	if (p_fs->vol_type == EXFAT) {
+		epmov = get_entry_in_dir(sb, p_olddir, oldentry+1, &sector_mov);
+		buf_lock(sb, sector_mov);
+		epnew = get_entry_in_dir(sb, p_newdir, newentry+1, &sector_new);
+		if (!epmov || !epnew) {
+			buf_unlock(sb, sector_mov);
+			return FFS_MEDIAERR;
+		}
+
+		memcpy((void *) epnew, (void *) epmov, DENTRY_SIZE);
+		buf_modify(sb, sector_new);
+		buf_unlock(sb, sector_mov);
+	} else if (p_fs->fs_func->get_entry_type(epnew) == TYPE_DIR) {
+		/* change ".." pointer to new parent dir */
+		clu.dir = p_fs->fs_func->get_entry_clu0(epnew);
+		clu.flags = 0x01;
+
+		epnew = get_entry_in_dir(sb, &clu, 1, &sector_new);
+		if (!epnew)
+			return FFS_MEDIAERR;
+
+		if (p_newdir->dir == p_fs->root_dir)
+			p_fs->fs_func->set_entry_clu0(epnew, CLUSTER_32(0));
+		else
+			p_fs->fs_func->set_entry_clu0(epnew, p_newdir->dir);
+		buf_modify(sb, sector_new);
+	}
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_newdir, newentry, num_new_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	p_fs->fs_func->delete_dir_entry(sb, p_olddir, oldentry, 0, num_old_entries);
+
+	fid->dir.dir = p_newdir->dir;
+	fid->dir.size = p_newdir->size;
+	fid->dir.flags = p_newdir->flags;
+
+	fid->entry = newentry;
+
+	return FFS_SUCCESS;
+} /* end of move_file */
+
+/*
+ *  Sector Read/Write Functions
+ */
+
+s32 sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 read)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((sec >= (p_fs->PBR_sector+p_fs->num_sectors)) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] sector_read: out of range error! (sec = %u)\n", sec);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_read(sb, sec, bh, 1, read);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of sector_read */
+
+s32 sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 sync)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (sec >= (p_fs->PBR_sector+p_fs->num_sectors) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] sector_write: out of range error! (sec = %u)\n", sec);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (bh == NULL) {
+		printk("[EXFAT] sector_write: bh is NULL!\n");
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_write(sb, sec, bh, 1, sync);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of sector_write */
+
+s32 multi_sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 num_secs, s32 read)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (((sec+num_secs) > (p_fs->PBR_sector+p_fs->num_sectors)) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] multi_sector_read: out of range error! (sec = %u, num_secs = %d)\n", sec, num_secs);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_read(sb, sec, bh, num_secs, read);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of multi_sector_read */
+
+s32 multi_sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 num_secs, s32 sync)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((sec+num_secs) > (p_fs->PBR_sector+p_fs->num_sectors) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] multi_sector_write: out of range error! (sec = %u, num_secs = %d)\n", sec, num_secs);
+		fs_error(sb);
+		return ret;
+	}
+	if (bh == NULL) {
+		printk("[EXFAT] multi_sector_write: bh is NULL!\n");
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_write(sb, sec, bh, num_secs, sync);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of multi_sector_write */
diff --git b/fs/exfat/exfat_core.h b/fs/exfat/exfat_core.h
new file mode 100644
index 0000000..4bdfe4e
--- /dev/null
+++ b/fs/exfat/exfat_core.h
@@ -0,0 +1,671 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_core.h                                              */
+/*  PURPOSE : Header File for exFAT File Manager                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_H
+#define _EXFAT_H
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+  /* For Debugging Purpose */
+	/* IOCTL code 'f' used by
+	 *   - file systems typically #0~0x1F
+	 *   - embedded terminal devices #128~
+	 *   - exts for debugging purpose #99
+	 * numbers 100 and 101 are available now but have possible conflicts
+	 */
+#define EXFAT_IOC_GET_DEBUGFLAGS       _IOR('f', 100, long)
+#define EXFAT_IOC_SET_DEBUGFLAGS       _IOW('f', 101, long)
+
+#define EXFAT_DEBUGFLAGS_INVALID_UMOUNT        0x01
+#define EXFAT_DEBUGFLAGS_ERROR_RW              0x02
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
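
For reference, a minimal userspace sketch of how these two debug ioctls could be driven. This is not part of the patch: the mount-point path is hypothetical, the EXFAT_IOC_* values are simply re-declared to mirror the definitions above, and it assumes the filesystem's ioctl handler copies a long to or from the pointer argument, as the _IOR/_IOW encoding suggests.

    /* Hedged userspace sketch; compile with an ordinary C compiler on Linux. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ioctl.h>

    #define EXFAT_IOC_GET_DEBUGFLAGS	_IOR('f', 100, long)
    #define EXFAT_IOC_SET_DEBUGFLAGS	_IOW('f', 101, long)
    #define EXFAT_DEBUGFLAGS_ERROR_RW	0x02

    int main(void)
    {
    	long flags = 0;
    	int fd = open("/mnt/exfat", O_RDONLY);	/* hypothetical exFAT mount point */

    	if (fd < 0)
    		return 1;
    	if (ioctl(fd, EXFAT_IOC_GET_DEBUGFLAGS, &flags) == 0)
    		printf("debug flags: 0x%lx\n", (unsigned long)flags);

    	flags |= EXFAT_DEBUGFLAGS_ERROR_RW;	/* e.g. request R/W error injection */
    	ioctl(fd, EXFAT_IOC_SET_DEBUGFLAGS, &flags);
    	close(fd);
    	return 0;
    }
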
+
+	/*----------------------------------------------------------------------*/
+	/*  Constant & Macro Definitions                                        */
+	/*----------------------------------------------------------------------*/
+
+#define DENTRY_SIZE             32          /* dir entry size */
+#define DENTRY_SIZE_BITS        5
+
+/* PBR entries */
+#define PBR_SIGNATURE           0xAA55
+#define EXT_SIGNATURE           0xAA550000
+#define VOL_LABEL               "NO NAME    " /* size should be 11 */
+#define OEM_NAME                "MSWIN4.1"  /* size should be 8 */
+#define STR_FAT12               "FAT12   "  /* size should be 8 */
+#define STR_FAT16               "FAT16   "  /* size should be 8 */
+#define STR_FAT32               "FAT32   "  /* size should be 8 */
+#define STR_EXFAT               "EXFAT   "  /* size should be 8 */
+#define VOL_CLEAN               0x0000
+#define VOL_DIRTY               0x0002
+
+/* max number of clusters */
+#define FAT12_THRESHOLD         4087        /* 2^12 - 1 + 2 (clu 0 & 1) */
+#define FAT16_THRESHOLD         65527       /* 2^16 - 1 + 2 */
+#define FAT32_THRESHOLD         268435457   /* 2^28 - 1 + 2 */
+#define EXFAT_THRESHOLD         268435457   /* 2^28 - 1 + 2 */
+
+/* file types */
+#define TYPE_UNUSED             0x0000
+#define TYPE_DELETED            0x0001
+#define TYPE_INVALID            0x0002
+#define TYPE_CRITICAL_PRI       0x0100
+#define TYPE_BITMAP             0x0101
+#define TYPE_UPCASE             0x0102
+#define TYPE_VOLUME             0x0103
+#define TYPE_DIR                0x0104
+#define TYPE_FILE               0x011F
+#define TYPE_SYMLINK            0x015F
+#define TYPE_CRITICAL_SEC       0x0200
+#define TYPE_STREAM             0x0201
+#define TYPE_EXTEND             0x0202
+#define TYPE_ACL                0x0203
+#define TYPE_BENIGN_PRI         0x0400
+#define TYPE_GUID               0x0401
+#define TYPE_PADDING            0x0402
+#define TYPE_ACLTAB             0x0403
+#define TYPE_BENIGN_SEC         0x0800
+#define TYPE_ALL                0x0FFF
+
+/* time modes */
+#define TM_CREATE               0
+#define TM_MODIFY               1
+#define TM_ACCESS               2
+
+/* checksum types */
+#define CS_DIR_ENTRY            0
+#define CS_PBR_SECTOR           1
+#define CS_DEFAULT              2
+
+#define CLUSTER_16(x)           ((u16)(x))
+#define CLUSTER_32(x)           ((u32)(x))
+
+#define FALSE			0
+#define TRUE			1
+
+#define MIN(a, b)		(((a) < (b)) ? (a) : (b))
+#define MAX(a, b)		(((a) > (b)) ? (a) : (b))
+
+#define START_SECTOR(x) \
+	((((x) - 2) << p_fs->sectors_per_clu_bits) + p_fs->data_start_sector)
+
+#define IS_LAST_SECTOR_IN_CLUSTER(sec) \
+		((((sec) - p_fs->data_start_sector + 1) & ((1 <<  p_fs->sectors_per_clu_bits) - 1)) == 0)
+
+#define GET_CLUSTER_FROM_SECTOR(sec)			\
+		((((sec) - p_fs->data_start_sector) >> p_fs->sectors_per_clu_bits) + 2)
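
As a worked illustration of the two mappings above (data clusters are numbered from 2, since FAT entries 0 and 1 are reserved): with sectors_per_clu_bits = 3 (8 sectors per cluster) and data_start_sector = 100 (made-up values, not from any real volume), cluster 5 starts at sector ((5 - 2) << 3) + 100 = 124, and the reverse mapping gives ((124 - 100) >> 3) + 2 = 5. A standalone sketch of the same arithmetic, with the struct standing in for the p_fs the macros implicitly reference:

    #include <stdio.h>
    #include <stdint.h>

    struct fs_geom {
    	uint32_t sectors_per_clu_bits;
    	uint32_t data_start_sector;
    };

    /* same arithmetic as START_SECTOR() */
    static uint32_t start_sector(const struct fs_geom *p_fs, uint32_t clu)
    {
    	return ((clu - 2) << p_fs->sectors_per_clu_bits) + p_fs->data_start_sector;
    }

    /* same arithmetic as GET_CLUSTER_FROM_SECTOR() */
    static uint32_t cluster_of(const struct fs_geom *p_fs, uint32_t sec)
    {
    	return ((sec - p_fs->data_start_sector) >> p_fs->sectors_per_clu_bits) + 2;
    }

    int main(void)
    {
    	struct fs_geom g = { .sectors_per_clu_bits = 3, .data_start_sector = 100 };

    	printf("cluster 5 starts at sector %u\n", start_sector(&g, 5));  /* 124 */
    	printf("sector 124 lies in cluster %u\n", cluster_of(&g, 124));  /* 5 */
    	return 0;
    }
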
+
+#define GET16(p_src) \
+	(((u16)(p_src)[0]) | (((u16)(p_src)[1]) << 8))
+#define GET32(p_src) \
+	(((u32)(p_src)[0]) | (((u32)(p_src)[1]) << 8) | \
+	(((u32)(p_src)[2]) << 16) | (((u32)(p_src)[3]) << 24))
+#define GET64(p_src) \
+	(((u64)(p_src)[0]) | (((u64)(p_src)[1]) << 8) | \
+	(((u64)(p_src)[2]) << 16) | (((u64)(p_src)[3]) << 24) | \
+	(((u64)(p_src)[4]) << 32) | (((u64)(p_src)[5]) << 40) | \
+	(((u64)(p_src)[6]) << 48) | (((u64)(p_src)[7]) << 56))
+
+
+#define SET16(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                     \
+		(p_dst)[1] = (u8)(((u16)(src)) >> 8);       \
+	} while (0)
+#define SET32(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                     \
+		(p_dst)[1] = (u8)(((u32)(src)) >> 8);       \
+		(p_dst)[2] = (u8)(((u32)(src)) >> 16);      \
+		(p_dst)[3] = (u8)(((u32)(src)) >> 24);      \
+	} while (0)
+#define SET64(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                   \
+		(p_dst)[1] = (u8)(((u64)(src)) >> 8);     \
+		(p_dst)[2] = (u8)(((u64)(src)) >> 16);    \
+		(p_dst)[3] = (u8)(((u64)(src)) >> 24);    \
+		(p_dst)[4] = (u8)(((u64)(src)) >> 32);    \
+		(p_dst)[5] = (u8)(((u64)(src)) >> 40);    \
+		(p_dst)[6] = (u8)(((u64)(src)) >> 48);    \
+		(p_dst)[7] = (u8)(((u64)(src)) >> 56);    \
+	} while (0)
+
+#ifdef __LITTLE_ENDIAN
+#define GET16_A(p_src)		(*((u16 *)(p_src)))
+#define GET32_A(p_src)		(*((u32 *)(p_src)))
+#define GET64_A(p_src)		(*((u64 *)(p_src)))
+#define SET16_A(p_dst, src)	(*((u16 *)(p_dst)) = (u16)(src))
+#define SET32_A(p_dst, src)	(*((u32 *)(p_dst)) = (u32)(src))
+#define SET64_A(p_dst, src)	(*((u64 *)(p_dst)) = (u64)(src))
+#else /* BIG_ENDIAN */
+#define GET16_A(p_src)		GET16(p_src)
+#define GET32_A(p_src)		GET32(p_src)
+#define GET64_A(p_src)		GET64(p_src)
+#define SET16_A(p_dst, src)	SET16(p_dst, src)
+#define SET32_A(p_dst, src)	SET32(p_dst, src)
+#define SET64_A(p_dst, src)	SET64(p_dst, src)
+#endif
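
To make the intent of these accessors concrete, here is a standalone sketch (using <stdint.h> types in place of the kernel's u8/u32) that reads and writes a 32-bit little-endian on-disk field byte by byte, exactly as GET32/SET32 do, so the result is independent of host endianness. The *_A variants above are only a shortcut: on little-endian hosts they collapse to a direct load or store, while on big-endian hosts they fall back to the byte-wise forms.

    #include <stdio.h>
    #include <stdint.h>

    /* same byte-by-byte layout as GET32/SET32 above */
    static uint32_t get32(const uint8_t *p)
    {
    	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
    	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    static void set32(uint8_t *p, uint32_t v)
    {
    	p[0] = (uint8_t)v;
    	p[1] = (uint8_t)(v >> 8);
    	p[2] = (uint8_t)(v >> 16);
    	p[3] = (uint8_t)(v >> 24);
    }

    int main(void)
    {
    	/* e.g. a start_clu field stored little-endian on disk */
    	uint8_t raw[4] = { 0x78, 0x56, 0x34, 0x12 };

    	printf("0x%08x\n", get32(raw));		/* prints 0x12345678 */
    	set32(raw, 0xCAFEBABE);
    	printf("%02x %02x %02x %02x\n", raw[0], raw[1], raw[2], raw[3]);
    	/* prints: be ba fe ca */
    	return 0;
    }
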
+
+/* Upcase table macros */
+#define HIGH_INDEX_BIT (8)
+#define HIGH_INDEX_MASK (0xFF00)
+#define LOW_INDEX_BIT (16-HIGH_INDEX_BIT)
+#define UTBL_ROW_COUNT (1<<LOW_INDEX_BIT)
+#define UTBL_COL_COUNT (1<<HIGH_INDEX_BIT)
+
+#if CONFIG_EXFAT_DEBUG_MSG
+#define DPRINTK(...)			\
+	do {								\
+		printk("[EXFAT] " __VA_ARGS__);	\
+	} while (0)
+#else
+#define DPRINTK(...)
+#endif
+
+static inline u16 get_col_index(u16 i)
+{
+	return i >> LOW_INDEX_BIT;
+}
+static inline u16 get_row_index(u16 i)
+{
+	return i & ~HIGH_INDEX_MASK;
+}
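
The helpers above split a 16-bit code unit into a two-level index: the high byte (get_col_index) selects one of up to 256 sub-arrays in vol_utbl, and the low byte (get_row_index) selects the entry inside it; sub-arrays that would be pure identity mappings can stay NULL. A standalone sketch of such a lookup with a tiny hand-built table (the real table is loaded from the volume by load_upcase_table(), declared later in this header):

    #include <stdio.h>
    #include <stdint.h>

    #define LOW_INDEX_BIT	8	/* matches 16 - HIGH_INDEX_BIT above */

    /* mirror of the vol_utbl lookup done by nls_upper() in exfat_nls.c */
    static uint16_t upcase(uint16_t **utbl, uint16_t c)
    {
    	uint16_t col = c >> LOW_INDEX_BIT;	/* get_col_index() */
    	uint16_t row = c & 0x00FF;		/* get_row_index() */

    	if (utbl && utbl[col])
    		return utbl[col][row];
    	return c;				/* missing column: identity */
    }

    int main(void)
    {
    	static uint16_t col0[256];
    	static uint16_t *utbl[256];
    	int i;

    	/* only column 0 allocated: upcases ASCII 'a'..'z', identity otherwise */
    	for (i = 0; i < 256; i++)
    		col0[i] = (i >= 'a' && i <= 'z') ? i - 0x20 : i;
    	utbl[0] = col0;

    	printf("%c -> %c\n", 'q', (char)upcase(utbl, 'q'));	/* q -> Q */
    	return 0;
    }
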
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+/* MS_DOS FAT partition boot record (512 bytes) */
+typedef struct {
+	u8       jmp_boot[3];
+	u8       oem_name[8];
+	u8       bpb[109];
+	u8       boot_code[390];
+	u8       signature[2];
+} PBR_SECTOR_T;
+
+/* MS-DOS FAT12/16 BIOS parameter block (51 bytes) */
+typedef struct {
+	u8       sector_size[2];
+	u8       sectors_per_clu;
+	u8       num_reserved[2];
+	u8       num_fats;
+	u8       num_root_entries[2];
+	u8       num_sectors[2];
+	u8       media_type;
+	u8       num_fat_sectors[2];
+	u8       sectors_in_track[2];
+	u8       num_heads[2];
+	u8       num_hid_sectors[4];
+	u8       num_huge_sectors[4];
+
+	u8       phy_drv_no;
+	u8       reserved;
+	u8       ext_signature;
+	u8       vol_serial[4];
+	u8       vol_label[11];
+	u8       vol_type[8];
+} BPB16_T;
+
+/* MS-DOS FAT32 BIOS parameter block (79 bytes) */
+typedef struct {
+	u8       sector_size[2];
+	u8       sectors_per_clu;
+	u8       num_reserved[2];
+	u8       num_fats;
+	u8       num_root_entries[2];
+	u8       num_sectors[2];
+	u8       media_type;
+	u8       num_fat_sectors[2];
+	u8       sectors_in_track[2];
+	u8       num_heads[2];
+	u8       num_hid_sectors[4];
+	u8       num_huge_sectors[4];
+	u8       num_fat32_sectors[4];
+	u8       ext_flags[2];
+	u8       fs_version[2];
+	u8       root_cluster[4];
+	u8       fsinfo_sector[2];
+	u8       backup_sector[2];
+	u8       reserved[12];
+
+	u8       phy_drv_no;
+	u8       ext_reserved;
+	u8       ext_signature;
+	u8       vol_serial[4];
+	u8       vol_label[11];
+	u8       vol_type[8];
+} BPB32_T;
+
+/* MS-DOS EXFAT BIOS parameter block (109 bytes) */
+typedef struct {
+	u8       reserved1[53];
+	u8       vol_offset[8];
+	u8       vol_length[8];
+	u8       fat_offset[4];
+	u8       fat_length[4];
+	u8       clu_offset[4];
+	u8       clu_count[4];
+	u8       root_cluster[4];
+	u8       vol_serial[4];
+	u8       fs_version[2];
+	u8       vol_flags[2];
+	u8       sector_size_bits;
+	u8       sectors_per_clu_bits;
+	u8       num_fats;
+	u8       phy_drv_no;
+	u8       perc_in_use;
+	u8       reserved2[7];
+} BPBEX_T;
+
+/* MS-DOS FAT file system information sector (512 bytes) */
+typedef struct {
+	u8       signature1[4];
+	u8       reserved1[480];
+	u8       signature2[4];
+	u8       free_cluster[4];
+	u8       next_cluster[4];
+	u8       reserved2[14];
+	u8       signature3[2];
+} FSI_SECTOR_T;
+
+/* MS-DOS FAT directory entry (32 bytes) */
+typedef struct {
+	u8       dummy[32];
+} DENTRY_T;
+
+typedef struct {
+	u8       name[DOS_NAME_LENGTH];
+	u8       attr;
+	u8       lcase;
+	u8       create_time_ms;
+	u8       create_time[2];
+	u8       create_date[2];
+	u8       access_date[2];
+	u8       start_clu_hi[2];
+	u8       modify_time[2];
+	u8       modify_date[2];
+	u8       start_clu_lo[2];
+	u8       size[4];
+} DOS_DENTRY_T;
+
+/* MS-DOS FAT extended directory entry (32 bytes) */
+typedef struct {
+	u8       order;
+	u8       unicode_0_4[10];
+	u8       attr;
+	u8       sysid;
+	u8       checksum;
+	u8       unicode_5_10[12];
+	u8       start_clu[2];
+	u8       unicode_11_12[4];
+} EXT_DENTRY_T;
+
+/* MS-DOS EXFAT file directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       num_ext;
+	u8       checksum[2];
+	u8       attr[2];
+	u8       reserved1[2];
+	u8       create_time[2];
+	u8       create_date[2];
+	u8       modify_time[2];
+	u8       modify_date[2];
+	u8       access_time[2];
+	u8       access_date[2];
+	u8       create_time_ms;
+	u8       modify_time_ms;
+	u8       access_time_ms;
+	u8       reserved2[9];
+} FILE_DENTRY_T;
+
+/* MS-DOS EXFAT stream extension directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       reserved1;
+	u8       name_len;
+	u8       name_hash[2];
+	u8       reserved2[2];
+	u8       valid_size[8];
+	u8       reserved3[4];
+	u8       start_clu[4];
+	u8       size[8];
+} STRM_DENTRY_T;
+
+/* MS-DOS EXFAT file name directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       unicode_0_14[30];
+} NAME_DENTRY_T;
+
+/* MS-DOS EXFAT allocation bitmap directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       reserved[18];
+	u8       start_clu[4];
+	u8       size[8];
+} BMAP_DENTRY_T;
+
+/* MS-DOS EXFAT up-case table directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       reserved1[3];
+	u8       checksum[4];
+	u8       reserved2[12];
+	u8       start_clu[4];
+	u8       size[8];
+} CASE_DENTRY_T;
+
+/* MS-DOS EXFAT volume label directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       label_len;
+	u8       unicode_0_10[22];
+	u8       reserved[8];
+} VOLM_DENTRY_T;
+
+/* unused entry hint information */
+typedef struct {
+	u32      dir;
+	s32       entry;
+	CHAIN_T     clu;
+} UENTRY_T;
+
+typedef struct {
+	s32       (*alloc_cluster)(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+	void        (*free_cluster)(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+	s32       (*count_used_clusters)(struct super_block *sb);
+
+	s32      (*init_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+								 u32 start_clu, u64 size);
+	s32      (*init_ext_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+								 UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+	s32       (*find_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+	void        (*delete_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 offset, s32 num_entries);
+	void        (*get_uni_name_from_ext_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+	s32       (*count_ext_entries)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+	s32       (*calc_num_entries)(UNI_NAME_T *p_uniname);
+
+	u32      (*get_entry_type)(DENTRY_T *p_entry);
+	void        (*set_entry_type)(DENTRY_T *p_entry, u32 type);
+	u32      (*get_entry_attr)(DENTRY_T *p_entry);
+	void        (*set_entry_attr)(DENTRY_T *p_entry, u32 attr);
+	u8       (*get_entry_flag)(DENTRY_T *p_entry);
+	void        (*set_entry_flag)(DENTRY_T *p_entry, u8 flag);
+	u32      (*get_entry_clu0)(DENTRY_T *p_entry);
+	void        (*set_entry_clu0)(DENTRY_T *p_entry, u32 clu0);
+	u64      (*get_entry_size)(DENTRY_T *p_entry);
+	void        (*set_entry_size)(DENTRY_T *p_entry, u64 size);
+	void        (*get_entry_time)(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+	void        (*set_entry_time)(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+} FS_FUNC_T;
+
+typedef struct __FS_INFO_T {
+	u32      drv;                    /* drive ID */
+	u32      vol_type;               /* volume FAT type */
+	u32      vol_id;                 /* volume serial number */
+
+	u32      num_sectors;            /* num of sectors in volume */
+	u32      num_clusters;           /* num of clusters in volume */
+	u32      cluster_size;           /* cluster size in bytes */
+	u32      cluster_size_bits;
+	u32      sectors_per_clu;        /* cluster size in sectors */
+	u32      sectors_per_clu_bits;
+
+	u32      PBR_sector;             /* PBR sector */
+	u32      FAT1_start_sector;      /* FAT1 start sector */
+	u32      FAT2_start_sector;      /* FAT2 start sector */
+	u32      root_start_sector;      /* root dir start sector */
+	u32      data_start_sector;      /* data area start sector */
+	u32      num_FAT_sectors;        /* num of FAT sectors */
+
+	u32      root_dir;               /* root dir cluster */
+	u32      dentries_in_root;       /* num of dentries in root dir */
+	u32      dentries_per_clu;       /* num of dentries per cluster */
+
+	u32      vol_flag;               /* volume dirty flag */
+	struct buffer_head *pbr_bh;         /* PBR sector */
+
+	u32      map_clu;                /* allocation bitmap start cluster */
+	u32      map_sectors;            /* num of allocation bitmap sectors */
+	struct buffer_head **vol_amap;      /* allocation bitmap */
+
+	u16      **vol_utbl;               /* upcase table */
+
+	u32      clu_srch_ptr;           /* cluster search pointer */
+	u32      used_clusters;          /* number of used clusters */
+	UENTRY_T    hint_uentry;         /* unused entry hint information */
+
+	u32      dev_ejected;            /* block device operation error flag */
+
+	FS_FUNC_T	*fs_func;
+	struct semaphore v_sem;
+
+	/* FAT cache */
+	BUF_CACHE_T FAT_cache_array[FAT_CACHE_SIZE];
+	BUF_CACHE_T FAT_cache_lru_list;
+	BUF_CACHE_T FAT_cache_hash_list[FAT_CACHE_HASH_SIZE];
+
+	/* buf cache */
+	BUF_CACHE_T buf_cache_array[BUF_CACHE_SIZE];
+	BUF_CACHE_T buf_cache_lru_list;
+	BUF_CACHE_T buf_cache_hash_list[BUF_CACHE_HASH_SIZE];
+} FS_INFO_T;
+
+#define ES_2_ENTRIES		2
+#define ES_3_ENTRIES		3
+#define ES_ALL_ENTRIES	0
+
+typedef struct {
+	u32	sector;		/* sector number that contains file_entry */
+	s32	offset;		/* byte offset in the sector */
+	s32	alloc_flag;	/* flag in stream entry: 01 for cluster chain, 03 for contiguous clusters */
+	u32 num_entries;
+
+	/* __buf should be the last member */
+	void *__buf;
+} ENTRY_SET_CACHE_T;
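
The requirement that __buf stay the last member suggests the usual single-allocation trick: the header and the copied directory entries share one buffer, with the entries starting at the __buf field. The allocator itself (presumably get_entry_set_in_dir(), declared below) is not in this hunk, so the following userspace-flavoured sketch only illustrates that layout; the malloc call, helper name and sizes are assumptions, not the driver's code.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define DENTRY_SIZE	32	/* matches the on-disk dentry size above */

    struct entry_set_cache {
    	uint32_t sector;
    	int32_t  offset;
    	int32_t  alloc_flag;
    	uint32_t num_entries;
    	void    *__buf;		/* must remain the last member */
    };

    /* one allocation: header immediately followed by num_entries raw dentries */
    static struct entry_set_cache *alloc_entry_set(uint32_t num_entries)
    {
    	size_t sz = offsetof(struct entry_set_cache, __buf) +
    		    (size_t)num_entries * DENTRY_SIZE;
    	struct entry_set_cache *es = malloc(sz);

    	if (es)
    		es->num_entries = num_entries;
    	return es;
    }

    int main(void)
    {
    	/* e.g. file entry + stream entry + one name entry */
    	struct entry_set_cache *es = alloc_entry_set(3);

    	if (!es)
    		return 1;
    	free(es);
    	return 0;
    }
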
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* file system initialization & shutdown functions */
+s32 ffsInit(void);
+s32 ffsShutdown(void);
+
+/* volume management functions */
+s32 ffsMountVol(struct super_block *sb);
+s32 ffsUmountVol(struct super_block *sb);
+s32 ffsCheckVol(struct super_block *sb);
+s32 ffsGetVolInfo(struct super_block *sb, VOL_INFO_T *info);
+s32 ffsSyncVol(struct super_block *sb, s32 do_sync);
+
+/* file management functions */
+s32 ffsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid);
+s32 ffsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid);
+s32 ffsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount);
+s32 ffsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount);
+s32 ffsTruncateFile(struct inode *inode, u64 old_size, u64 new_size);
+s32 ffsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry);
+s32 ffsRemoveFile(struct inode *inode, FILE_ID_T *fid);
+s32 ffsSetAttr(struct inode *inode, u32 attr);
+s32 ffsGetStat(struct inode *inode, DIR_ENTRY_T *info);
+s32 ffsSetStat(struct inode *inode, DIR_ENTRY_T *info);
+s32 ffsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu);
+
+/* directory management functions */
+s32 ffsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid);
+s32 ffsReadDir(struct inode *inode, DIR_ENTRY_T *dir_ent);
+s32 ffsRemoveDir(struct inode *inode, FILE_ID_T *fid);
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations (NOT TO UPPER LAYER)                 */
+/*----------------------------------------------------------------------*/
+
+/* fs management functions */
+s32  fs_init(void);
+s32  fs_shutdown(void);
+void   fs_set_vol_flags(struct super_block *sb, u32 new_flag);
+void   fs_sync(struct super_block *sb, s32 do_sync);
+void   fs_error(struct super_block *sb);
+
+/* cluster management functions */
+s32   clear_cluster(struct super_block *sb, u32 clu);
+s32  fat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+s32  exfat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+void   fat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+void   exfat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+u32 find_last_cluster(struct super_block *sb, CHAIN_T *p_chain);
+s32  count_num_clusters(struct super_block *sb, CHAIN_T *dir);
+s32  fat_count_used_clusters(struct super_block *sb);
+s32  exfat_count_used_clusters(struct super_block *sb);
+void   exfat_chain_cont_cluster(struct super_block *sb, u32 chain, s32 len);
+
+/* allocation bitmap management functions */
+s32  load_alloc_bitmap(struct super_block *sb);
+void   free_alloc_bitmap(struct super_block *sb);
+s32   set_alloc_bitmap(struct super_block *sb, u32 clu);
+s32   clr_alloc_bitmap(struct super_block *sb, u32 clu);
+u32 test_alloc_bitmap(struct super_block *sb, u32 clu);
+void   sync_alloc_bitmap(struct super_block *sb);
+
+/* upcase table management functions */
+s32  load_upcase_table(struct super_block *sb);
+void   free_upcase_table(struct super_block *sb);
+
+/* dir entry management functions */
+u32 fat_get_entry_type(DENTRY_T *p_entry);
+u32 exfat_get_entry_type(DENTRY_T *p_entry);
+void   fat_set_entry_type(DENTRY_T *p_entry, u32 type);
+void   exfat_set_entry_type(DENTRY_T *p_entry, u32 type);
+u32 fat_get_entry_attr(DENTRY_T *p_entry);
+u32 exfat_get_entry_attr(DENTRY_T *p_entry);
+void   fat_set_entry_attr(DENTRY_T *p_entry, u32 attr);
+void   exfat_set_entry_attr(DENTRY_T *p_entry, u32 attr);
+u8  fat_get_entry_flag(DENTRY_T *p_entry);
+u8  exfat_get_entry_flag(DENTRY_T *p_entry);
+void   fat_set_entry_flag(DENTRY_T *p_entry, u8 flag);
+void   exfat_set_entry_flag(DENTRY_T *p_entry, u8 flag);
+u32 fat_get_entry_clu0(DENTRY_T *p_entry);
+u32 exfat_get_entry_clu0(DENTRY_T *p_entry);
+void   fat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu);
+void   exfat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu);
+u64 fat_get_entry_size(DENTRY_T *p_entry);
+u64 exfat_get_entry_size(DENTRY_T *p_entry);
+void   fat_set_entry_size(DENTRY_T *p_entry, u64 size);
+void   exfat_set_entry_size(DENTRY_T *p_entry, u64 size);
+void   fat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   exfat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   fat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   exfat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+s32   fat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, u32 start_clu, u64 size);
+s32   exfat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, u32 start_clu, u64 size);
+s32   fat_init_ext_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+s32   exfat_init_ext_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+void   init_dos_entry(DOS_DENTRY_T *ep, u32 type, u32 start_clu);
+void   init_ext_entry(EXT_DENTRY_T *ep, s32 order, u8 chksum, u16 *uniname);
+void   init_file_entry(FILE_DENTRY_T *ep, u32 type);
+void   init_strm_entry(STRM_DENTRY_T *ep, u8 flags, u32 start_clu, u64 size);
+void   init_name_entry(NAME_DENTRY_T *ep, u16 *uniname);
+void   fat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries);
+void   exfat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries);
+
+s32   find_location(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector, s32 *offset);
+DENTRY_T *get_entry_with_sector(struct super_block *sb, u32 sector, s32 offset);
+DENTRY_T *get_entry_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector);
+ENTRY_SET_CACHE_T *get_entry_set_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, DENTRY_T **file_ep);
+void release_entry_set(ENTRY_SET_CACHE_T *es);
+s32 write_whole_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es);
+s32 write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, DENTRY_T *ep, u32 count);
+s32  search_deleted_or_unused_entry(struct super_block *sb, CHAIN_T *p_dir, s32 num_entries);
+s32  find_empty_entry(struct inode *inode, CHAIN_T *p_dir, s32 num_entries);
+s32  fat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+s32  exfat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+s32  fat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+s32  exfat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+s32  count_dos_name_entries(struct super_block *sb, CHAIN_T *p_dir, u32 type);
+void   update_dir_checksum(struct super_block *sb, CHAIN_T *p_dir, s32 entry);
+void update_dir_checksum_with_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es);
+bool   is_dir_empty(struct super_block *sb, CHAIN_T *p_dir);
+
+/* name conversion functions */
+s32  get_num_entries_and_dos_name(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 *entries, DOS_NAME_T *p_dosname);
+void   get_uni_name_from_dos_entry(struct super_block *sb, DOS_DENTRY_T *ep, UNI_NAME_T *p_uniname, u8 mode);
+void   fat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+void   exfat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+s32  extract_uni_name_from_ext_entry(EXT_DENTRY_T *ep, u16 *uniname, s32 order);
+s32  extract_uni_name_from_name_entry(NAME_DENTRY_T *ep, u16 *uniname, s32 order);
+s32  fat_generate_dos_name(struct super_block *sb, CHAIN_T *p_dir, DOS_NAME_T *p_dosname);
+void   fat_attach_count_to_dos_name(u8 *dosname, s32 count);
+s32  fat_calc_num_entries(UNI_NAME_T *p_uniname);
+s32  exfat_calc_num_entries(UNI_NAME_T *p_uniname);
+u8  calc_checksum_1byte(void *data, s32 len, u8 chksum);
+u16 calc_checksum_2byte(void *data, s32 len, u16 chksum, s32 type);
+u32 calc_checksum_4byte(void *data, s32 len, u32 chksum, s32 type);
+
+/* name resolution functions */
+s32  resolve_path(struct inode *inode, char *path, CHAIN_T *p_dir, UNI_NAME_T *p_uniname);
+s32  resolve_name(u8 *name, u8 **arg);
+
+/* file operation functions */
+s32  fat16_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  fat32_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  exfat_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  create_dir(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+s32  create_file(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, u8 mode, FILE_ID_T *fid);
+void   remove_file(struct inode *inode, CHAIN_T *p_dir, s32 entry);
+s32  rename_file(struct inode *inode, CHAIN_T *p_dir, s32 old_entry, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+s32  move_file(struct inode *inode, CHAIN_T *p_olddir, s32 oldentry, CHAIN_T *p_newdir, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+
+/* sector read/write functions */
+s32   sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 read);
+s32   sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 sync);
+s32   multi_sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 num_secs, s32 read);
+s32   multi_sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 num_secs, s32 sync);
+
+#endif /* _EXFAT_H */
diff --git b/fs/exfat/exfat_data.c b/fs/exfat/exfat_data.c
new file mode 100644
index 0000000..65da07a
--- /dev/null
+++ b/fs/exfat/exfat_data.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_data.c                                              */
+/*  PURPOSE : exFAT Configurable Data Definitions                       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*                    GLOBAL VARIABLE DEFINITIONS                       */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  File Manager                                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Buffer Manager                                                      */
+/*----------------------------------------------------------------------*/
+
+/* FAT cache */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(f_sem);
+#else
+DEFINE_SEMAPHORE(f_sem);
+#endif
+BUF_CACHE_T FAT_cache_array[FAT_CACHE_SIZE];
+BUF_CACHE_T FAT_cache_lru_list;
+BUF_CACHE_T FAT_cache_hash_list[FAT_CACHE_HASH_SIZE];
+
+/* buf cache */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(b_sem);
+#else
+DEFINE_SEMAPHORE(b_sem);
+#endif
+BUF_CACHE_T buf_cache_array[BUF_CACHE_SIZE];
+BUF_CACHE_T buf_cache_lru_list;
+BUF_CACHE_T buf_cache_hash_list[BUF_CACHE_HASH_SIZE];
diff --git b/fs/exfat/exfat_data.h b/fs/exfat/exfat_data.h
new file mode 100644
index 0000000..53b0e39
--- /dev/null
+++ b/fs/exfat/exfat_data.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_data.h                                              */
+/*  PURPOSE : Header File for exFAT Configurable Constants              */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_DATA_H
+#define _EXFAT_DATA_H
+
+#include "exfat_config.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*                        FFS CONFIGURATIONS                            */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/* max number of root directory entries in FAT12/16 */
+/* (should be a power of 2)                         */
+#define MAX_DENTRY              512
+
+/* cache size (in number of sectors)                */
+/* (should be a power of 2)                         */
+#define FAT_CACHE_SIZE          128
+#define FAT_CACHE_HASH_SIZE     64
+#define BUF_CACHE_SIZE          256
+#define BUF_CACHE_HASH_SIZE     64
+
+#endif /* _EXFAT_DATA_H */
diff --git b/fs/exfat/exfat_nls.c b/fs/exfat/exfat_nls.c
new file mode 100644
index 0000000..a48b3d0
--- /dev/null
+++ b/fs/exfat/exfat_nls.c
@@ -0,0 +1,448 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_nls.c                                               */
+/*  PURPOSE : exFAT NLS Manager                                         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+#include <linux/nls.h>
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+static u16 bad_dos_chars[] = {
+	/* + , ; = [ ] */
+	0x002B, 0x002C, 0x003B, 0x003D, 0x005B, 0x005D,
+	0xFF0B, 0xFF0C, 0xFF1B, 0xFF1D, 0xFF3B, 0xFF3D,
+	0
+};
+
+static u16 bad_uni_chars[] = {
+	/* " * / : < > ? \ | */
+	0x0022,         0x002A, 0x002F, 0x003A,
+	0x003C, 0x003E, 0x003F, 0x005C, 0x007C,
+	0
+};
+
+/*----------------------------------------------------------------------*/
+/*  Local Function Declarations                                         */
+/*----------------------------------------------------------------------*/
+
+static s32  convert_uni_to_ch(struct nls_table *nls, u8 *ch, u16 uni, s32 *lossy);
+static s32  convert_ch_to_uni(struct nls_table *nls, u16 *uni, u8 *ch, s32 *lossy);
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*======================================================================*/
+
+u16 nls_upper(struct super_block *sb, u16 a)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (EXFAT_SB(sb)->options.casesensitive)
+		return a;
+	if (p_fs->vol_utbl != NULL && (p_fs->vol_utbl)[get_col_index(a)] != NULL)
+		return (p_fs->vol_utbl)[get_col_index(a)][get_row_index(a)];
+	else
+		return a;
+}
+
+u16 *nls_wstrchr(u16 *str, u16 wchar)
+{
+	while (*str) {
+		if (*(str++) == wchar)
+			return str;
+	}
+
+	return 0;
+}
+
+s32 nls_dosname_cmp(struct super_block *sb, u8 *a, u8 *b)
+{
+	return strncmp((void *) a, (void *) b, DOS_NAME_LENGTH);
+} /* end of nls_dosname_cmp */
+
+s32 nls_uniname_cmp(struct super_block *sb, u16 *a, u16 *b)
+{
+	int i;
+
+	for (i = 0; i < MAX_NAME_LENGTH; i++, a++, b++) {
+		if (nls_upper(sb, *a) != nls_upper(sb, *b))
+			return 1;
+		if (*a == 0x0)
+			return 0;
+	}
+	return 0;
+} /* end of nls_uniname_cmp */
+
+void nls_uniname_to_dosname(struct super_block *sb, DOS_NAME_T *p_dosname, UNI_NAME_T *p_uniname, s32 *p_lossy)
+{
+	int i, j, len, lossy = FALSE;
+	u8 buf[MAX_CHARSET_SIZE];
+	u8 lower = 0, upper = 0;
+	u8 *dosname = p_dosname->name;
+	u16 *uniname = p_uniname->name;
+	u16 *p, *last_period;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_disk;
+
+	for (i = 0; i < DOS_NAME_LENGTH; i++)
+		*(dosname+i) = ' ';
+
+	if (!nls_uniname_cmp(sb, uniname, (u16 *) UNI_CUR_DIR_NAME)) {
+		*(dosname) = '.';
+		p_dosname->name_case = 0x0;
+		if (p_lossy != NULL)
+			*p_lossy = FALSE;
+		return;
+	}
+
+	if (!nls_uniname_cmp(sb, uniname, (u16 *) UNI_PAR_DIR_NAME)) {
+		*(dosname) = '.';
+		*(dosname+1) = '.';
+		p_dosname->name_case = 0x0;
+		if (p_lossy != NULL)
+			*p_lossy = FALSE;
+		return;
+	}
+
+	/* search for the last embedded period */
+	last_period = NULL;
+	for (p = uniname; *p; p++) {
+		if (*p == (u16) '.')
+			last_period = p;
+	}
+
+	i = 0;
+	while (i < DOS_NAME_LENGTH) {
+		if (i == 8) {
+			if (last_period == NULL)
+				break;
+
+			if (uniname <= last_period) {
+				if (uniname < last_period)
+					lossy = TRUE;
+				uniname = last_period + 1;
+			}
+		}
+
+		if (*uniname == (u16) '\0') {
+			break;
+		} else if (*uniname == (u16) ' ') {
+			lossy = TRUE;
+		} else if (*uniname == (u16) '.') {
+			if (uniname < last_period)
+				lossy = TRUE;
+			else
+				i = 8;
+		} else if (nls_wstrchr(bad_dos_chars, *uniname)) {
+			lossy = TRUE;
+			*(dosname+i) = '_';
+			i++;
+		} else {
+			len = convert_uni_to_ch(nls, buf, *uniname, &lossy);
+
+			if (len > 1) {
+				if ((i >= 8) && ((i+len) > DOS_NAME_LENGTH))
+					break;
+
+				if ((i <  8) && ((i+len) > 8)) {
+					i = 8;
+					continue;
+				}
+
+				lower = 0xFF;
+
+				for (j = 0; j < len; j++, i++)
+					*(dosname+i) = *(buf+j);
+			} else { /* len == 1 */
+				if ((*buf >= 'a') && (*buf <= 'z')) {
+					*(dosname+i) = *buf - ('a' - 'A');
+
+					if (i < 8)
+						lower |= 0x08;
+					else
+						lower |= 0x10;
+				} else if ((*buf >= 'A') && (*buf <= 'Z')) {
+					*(dosname+i) = *buf;
+
+					if (i < 8)
+						upper |= 0x08;
+					else
+						upper |= 0x10;
+				} else {
+					*(dosname+i) = *buf;
+				}
+				i++;
+			}
+		}
+
+		uniname++;
+	}
+
+	if (*dosname == 0xE5)
+		*dosname = 0x05;
+
+	if (*uniname != 0x0)
+		lossy = TRUE;
+
+	if (upper & lower)
+		p_dosname->name_case = 0xFF;
+	else
+		p_dosname->name_case = lower;
+
+	if (p_lossy != NULL)
+		*p_lossy = lossy;
+} /* end of nls_uniname_to_dosname */
+
+void nls_dosname_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i = 0, j, n = 0;
+	u8 buf[DOS_NAME_LENGTH+2];
+	u8 *dosname = p_dosname->name;
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_disk;
+
+	if (*dosname == 0x05) {
+		*buf = 0xE5;
+		i++;
+		n++;
+	}
+
+	for (; i < 8; i++, n++) {
+		if (*(dosname+i) == ' ')
+			break;
+
+		if ((*(dosname+i) >= 'A') && (*(dosname+i) <= 'Z') && (p_dosname->name_case & 0x08))
+			*(buf+n) = *(dosname+i) + ('a' - 'A');
+		else
+			*(buf+n) = *(dosname+i);
+	}
+	if (*(dosname+8) != ' ') {
+		*(buf+n) = '.';
+		n++;
+	}
+
+	for (i = 8; i < DOS_NAME_LENGTH; i++, n++) {
+		if (*(dosname+i) == ' ')
+			break;
+
+		if ((*(dosname+i) >= 'A') && (*(dosname+i) <= 'Z') && (p_dosname->name_case & 0x10))
+			*(buf+n) = *(dosname+i) + ('a' - 'A');
+		else
+			*(buf+n) = *(dosname+i);
+	}
+	*(buf+n) = '\0';
+
+	i = j = 0;
+	while (j < (MAX_NAME_LENGTH-1)) {
+		if (*(buf+i) == '\0')
+			break;
+
+		i += convert_ch_to_uni(nls, uniname, (buf+i), NULL);
+
+		uniname++;
+		j++;
+	}
+
+	*uniname = (u16) '\0';
+} /* end of nls_dosname_to_uniname */
+
+void nls_uniname_to_cstring(struct super_block *sb, u8 *p_cstring, UNI_NAME_T *p_uniname)
+{
+	int i, j, len;
+	u8 buf[MAX_CHARSET_SIZE];
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+	if (nls == NULL) {
+		len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, p_cstring, MAX_NAME_LENGTH);
+		p_cstring[len] = 0;
+		return;
+	}
+
+	i = 0;
+	while (i < (MAX_NAME_LENGTH-1)) {
+		if (*uniname == (u16) '\0')
+			break;
+
+		len = convert_uni_to_ch(nls, buf, *uniname, NULL);
+
+		if (len > 1) {
+			for (j = 0; j < len; j++)
+				*p_cstring++ = (char) *(buf+j);
+		} else { /* len == 1 */
+			*p_cstring++ = (char) *buf;
+		}
+
+		uniname++;
+		i++;
+	}
+
+	*p_cstring = '\0';
+} /* end of nls_uniname_to_cstring */
+
+void nls_cstring_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, u8 *p_cstring, s32 *p_lossy)
+{
+	int i, j, lossy = FALSE;
+	u8 *end_of_name;
+	u8 upname[MAX_NAME_LENGTH * 2];
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+
+	/* strip all trailing spaces */
+	end_of_name = p_cstring + strlen((char *) p_cstring);
+
+	while (*(--end_of_name) == ' ') {
+		if (end_of_name < p_cstring)
+			break;
+	}
+	*(++end_of_name) = '\0';
+
+	if (strcmp((char *) p_cstring, ".") && strcmp((char *) p_cstring, "..")) {
+
+		/* strip all trailing periods */
+		while (*(--end_of_name) == '.') {
+			if (end_of_name < p_cstring)
+				break;
+		}
+		*(++end_of_name) = '\0';
+	}
+
+	if (*p_cstring == '\0')
+		lossy = TRUE;
+
+	if (nls == NULL) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,101)
+		i = utf8s_to_utf16s(p_cstring, MAX_NAME_LENGTH, uniname);
+#else
+		i = utf8s_to_utf16s(p_cstring, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, uniname, MAX_NAME_LENGTH);
+#endif
+		for (j = 0; j < i; j++)
+			SET16_A(upname + j * 2, nls_upper(sb, uniname[j]));
+		uniname[i] = '\0';
+	}
+	else {
+		i = j = 0;
+		while (j < (MAX_NAME_LENGTH-1)) {
+			if (*(p_cstring+i) == '\0')
+				break;
+
+			i += convert_ch_to_uni(nls, uniname, (u8 *)(p_cstring+i), &lossy);
+
+			if ((*uniname < 0x0020) || nls_wstrchr(bad_uni_chars, *uniname))
+				lossy = TRUE;
+
+			SET16_A(upname + j * 2, nls_upper(sb, *uniname));
+
+			uniname++;
+			j++;
+		}
+
+		if (*(p_cstring+i) != '\0')
+			lossy = TRUE;
+		*uniname = (u16) '\0';
+	}
+
+	p_uniname->name_len = j;
+	p_uniname->name_hash = calc_checksum_2byte((void *) upname, j<<1, 0, CS_DEFAULT);
+
+	if (p_lossy != NULL)
+		*p_lossy = lossy;
+} /* end of nls_cstring_to_uniname */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+static s32 convert_ch_to_uni(struct nls_table *nls, u16 *uni, u8 *ch, s32 *lossy)
+{
+	int len;
+
+	*uni = 0x0;
+
+	if (ch[0] < 0x80) {
+		*uni = (u16) ch[0];
+		return 1;
+	}
+
+	len = nls->char2uni(ch, NLS_MAX_CHARSET_SIZE, uni);
+	if (len < 0) {
+		/* conversion failed */
+		printk("%s: failed to use nls\n", __func__);
+		if (lossy != NULL)
+			*lossy = TRUE;
+		*uni = (u16) '_';
+		if (!strcmp(nls->charset, "utf8"))
+			return 1;
+		else
+			return 2;
+	}
+
+	return len;
+} /* end of convert_ch_to_uni */
+
+static s32 convert_uni_to_ch(struct nls_table *nls, u8 *ch, u16 uni, s32 *lossy)
+{
+	int len;
+
+	ch[0] = 0x0;
+
+	if (uni < 0x0080) {
+		ch[0] = (u8) uni;
+		return 1;
+	}
+
+	len = nls->uni2char(uni, ch, NLS_MAX_CHARSET_SIZE);
+	if (len < 0) {
+		/* conversion failed */
+		printk("%s: failed to use nls\n", __func__);
+		if (lossy != NULL)
+			*lossy = TRUE;
+		ch[0] = '_';
+		return 1;
+	}
+
+	return len;
+
+} /* end of convert_uni_to_ch */
diff --git b/fs/exfat/exfat_nls.h b/fs/exfat/exfat_nls.h
new file mode 100644
index 0000000..bc516d7
--- /dev/null
+++ b/fs/exfat/exfat_nls.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_nls.h                                               */
+/*  PURPOSE : Header File for exFAT NLS Manager                         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_NLS_H
+#define _EXFAT_NLS_H
+
+#include <linux/types.h>
+#include <linux/nls.h>
+
+#include "exfat_config.h"
+#include "exfat_api.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define NUM_UPCASE              2918
+
+#define DOS_CUR_DIR_NAME        ".          "
+#define DOS_PAR_DIR_NAME        "..         "
+
+#ifdef __LITTLE_ENDIAN
+#define UNI_CUR_DIR_NAME        ".\0"
+#define UNI_PAR_DIR_NAME        ".\0.\0"
+#else
+#define UNI_CUR_DIR_NAME        "\0."
+#define UNI_PAR_DIR_NAME        "\0.\0."
+#endif
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+/* DOS name structure */
+typedef struct {
+	u8       name[DOS_NAME_LENGTH];
+	u8       name_case;
+} DOS_NAME_T;
+
+/* unicode name structure */
+typedef struct {
+	u16      name[MAX_NAME_LENGTH];
+	u16      name_hash;
+	u8       name_len;
+} UNI_NAME_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* NLS management function */
+u16 nls_upper(struct super_block *sb, u16 a);
+s32  nls_dosname_cmp(struct super_block *sb, u8 *a, u8 *b);
+s32  nls_uniname_cmp(struct super_block *sb, u16 *a, u16 *b);
+void   nls_uniname_to_dosname(struct super_block *sb, DOS_NAME_T *p_dosname, UNI_NAME_T *p_uniname, s32 *p_lossy);
+void   nls_dosname_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+void   nls_uniname_to_cstring(struct super_block *sb, u8 *p_cstring, UNI_NAME_T *p_uniname);
+void   nls_cstring_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, u8 *p_cstring, s32 *p_lossy);
+
+#endif /* _EXFAT_NLS_H */
diff --git b/fs/exfat/exfat_oal.c b/fs/exfat/exfat_oal.c
new file mode 100644
index 0000000..9b33998
--- /dev/null
+++ b/fs/exfat/exfat_oal.c
@@ -0,0 +1,190 @@
+/* Some of the source code in this file came from "linux/fs/fat/misc.c".  */
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_oal.c                                               */
+/*  PURPOSE : exFAT OS Adaptation Layer                                 */
+/*            (Semaphore Functions & Real-Time Clock Functions)         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/semaphore.h>
+#include <linux/time.h>
+
+#include "exfat_config.h"
+#include "exfat_api.h"
+#include "exfat_oal.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*            SEMAPHORE FUNCTIONS                                       */
+/*                                                                      */
+/*======================================================================*/
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(z_sem);
+#else
+DEFINE_SEMAPHORE(z_sem);
+#endif
+
+s32 sm_init(struct semaphore *sm)
+{
+	sema_init(sm, 1);
+	return 0;
+} /* end of sm_init */
+
+s32 sm_P(struct semaphore *sm)
+{
+	down(sm);
+	return 0;
+} /* end of sm_P */
+
+void sm_V(struct semaphore *sm)
+{
+	up(sm);
+} /* end of sm_V */
+
+
+/*======================================================================*/
+/*                                                                      */
+/*            REAL-TIME CLOCK FUNCTIONS                                 */
+/*                                                                      */
+/*======================================================================*/
+
+extern struct timezone sys_tz;
+
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits  : value
+ * date:  0 -  4: day    (1 -  31)
+ * date:  5 -  8: month  (1 -  12)
+ * date:  9 - 15: year   (0 - 127) from 1980
+ * time:  0 -  4: sec    (0 -  29) 2sec counts
+ * time:  5 - 10: min    (0 -  59)
+ * time: 11 - 15: hour   (0 -  23)
+ */
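
The bit layout in the comment above packs a date into one 16-bit word and a time into another. A standalone sketch of that encoding, mirroring the comment rather than any helper in this file (the extraction masks follow directly from the bit widths listed):

    #include <stdio.h>
    #include <stdint.h>

    /* date: day bits 0-4, month bits 5-8, year-since-1980 bits 9-15
     * time: sec/2 bits 0-4, min bits 5-10, hour bits 11-15 */
    static uint16_t fat_date(unsigned year, unsigned mon, unsigned day)
    {
    	return (uint16_t)(((year - 1980) << 9) | (mon << 5) | day);
    }

    static uint16_t fat_time(unsigned hour, unsigned min, unsigned sec)
    {
    	return (uint16_t)((hour << 11) | (min << 5) | (sec / 2));
    }

    int main(void)
    {
    	uint16_t d = fat_date(2013, 7, 15);
    	uint16_t t = fat_time(12, 34, 56);

    	printf("date=0x%04x day=%u mon=%u year=%u\n",
    	       d, d & 0x1F, (d >> 5) & 0x0F, 1980 + (d >> 9));
    	printf("time=0x%04x hour=%u min=%u sec=%u\n",
    	       t, t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2);
    	return 0;
    }
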
+#define UNIX_SECS_1980   315532800L
+
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108   4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA_DECADE	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't a leap year */
+#define NO_LEAP_YEAR_2100    (120)
+#define IS_LEAP_YEAR(y)  (!((y) & 3) && (y) != NO_LEAP_YEAR_2100)
+
+#define SECS_PER_MIN     (60)
+#define SECS_PER_HOUR    (60 * SECS_PER_MIN)
+#define SECS_PER_DAY     (24 * SECS_PER_HOUR)
+
+#define MAKE_LEAP_YEAR(leap_year, year)                         \
+	do {                                                    \
+		if (unlikely(year > NO_LEAP_YEAR_2100))         \
+			leap_year = ((year + 3) / 4) - 1;       \
+		else                                            \
+			leap_year = ((year + 3) / 4);           \
+	} while (0)
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+static time_t accum_days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
+};
+
+TIMESTAMP_T *tm_current(TIMESTAMP_T *tp)
+{
+	struct timespec ts = CURRENT_TIME_SEC;
+	time_t second = ts.tv_sec;
+	time_t day, leap_day, month, year;
+
+	second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
+	if (second < UNIX_SECS_1980) {
+		tp->sec  = 0;
+		tp->min  = 0;
+		tp->hour = 0;
+		tp->day  = 1;
+		tp->mon  = 1;
+		tp->year = 0;
+		return tp;
+	}
+#if BITS_PER_LONG == 64
+	if (second >= UNIX_SECS_2108) {
+		tp->sec  = 59;
+		tp->min  = 59;
+		tp->hour = 23;
+		tp->day  = 31;
+		tp->mon  = 12;
+		tp->year = 127;
+		return tp;
+	}
+#endif
+
+	day = second / SECS_PER_DAY - DAYS_DELTA_DECADE;
+	year = day / 365;
+
+	MAKE_LEAP_YEAR(leap_day, year);
+	if (year * 365 + leap_day > day)
+		year--;
+
+	MAKE_LEAP_YEAR(leap_day, year);
+
+	day -= year * 365 + leap_day;
+
+	if (IS_LEAP_YEAR(year) && day == accum_days_in_year[3]) {
+		month = 2;
+	} else {
+		if (IS_LEAP_YEAR(year) && day > accum_days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (accum_days_in_year[month + 1] > day)
+				break;
+		}
+	}
+	day -= accum_days_in_year[month];
+
+	tp->sec  = second % SECS_PER_MIN;
+	tp->min  = (second / SECS_PER_MIN) % 60;
+	tp->hour = (second / SECS_PER_HOUR) % 24;
+	tp->day  = day + 1;
+	tp->mon  = month;
+	tp->year = year;
+
+	return tp;
+} /* end of tm_current */
diff --git b/fs/exfat/exfat_oal.h b/fs/exfat/exfat_oal.h
new file mode 100644
index 0000000..b6dd789
--- /dev/null
+++ b/fs/exfat/exfat_oal.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_oal.h                                               */
+/*  PURPOSE : Header File for exFAT OS Adaptation Layer                 */
+/*            (Semaphore Functions & Real-Time Clock Functions)         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_OAL_H
+#define _EXFAT_OAL_H
+
+#include <linux/semaphore.h>
+#include "exfat_config.h"
+#include <linux/version.h>
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Configurable)                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Non-Configurable)                     */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct {
+	u16      sec;        /* 0 ~ 59               */
+	u16      min;        /* 0 ~ 59               */
+	u16      hour;       /* 0 ~ 23               */
+	u16      day;        /* 1 ~ 31               */
+	u16      mon;        /* 1 ~ 12               */
+	u16      year;       /* 0 ~ 127 (since 1980) */
+} TIMESTAMP_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32 sm_init(struct semaphore *sm);
+s32 sm_P(struct semaphore *sm);
+void  sm_V(struct semaphore *sm);
+
+TIMESTAMP_T *tm_current(TIMESTAMP_T *tm);
+
+#endif /* _EXFAT_OAL_H */
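TIMESTAMP_T holds the broken-down fields that the on-disk format packs into two 16-bit words, following the bit layout documented in exfat_oal.c (day/month/year in the date word, 2-second counts/minutes/hours in the time word). Below is a minimal packing sketch, illustrative only and not part of the patch; the fat_pack_*/fat_unpack_* helper names are hypothetical.

/* Editorial sketch: packing broken-down fields into FAT's 16-bit date and
 * time words, per the bit layout documented in exfat_oal.c.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint16_t fat_pack_date(unsigned year, unsigned mon, unsigned day)
{
	/* date: bits 0-4 day (1-31), 5-8 month (1-12), 9-15 year since 1980 */
	return (uint16_t)((year << 9) | (mon << 5) | day);
}

static inline uint16_t fat_pack_time(unsigned hour, unsigned min, unsigned sec)
{
	/* time: bits 0-4 sec/2 (0-29), 5-10 min (0-59), 11-15 hour (0-23) */
	return (uint16_t)((hour << 11) | (min << 5) | (sec >> 1));
}

static inline void fat_unpack_date(uint16_t d, unsigned *year, unsigned *mon,
				   unsigned *day)
{
	*day  = d & 0x1f;
	*mon  = (d >> 5) & 0x0f;
	*year = d >> 9;
}

int main(void)
{
	/* example date: 25 July 2013 -> year 33 since 1980 */
	uint16_t d = fat_pack_date(2013 - 1980, 7, 25);
	unsigned year, mon, day;

	fat_unpack_date(d, &year, &mon, &day);
	printf("packed 0x%04x -> %u-%02u-%02u\n", (unsigned)d, 1980 + year, mon, day);
	return 0;
}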
diff --git b/fs/exfat/exfat_super.c b/fs/exfat/exfat_super.c
new file mode 100644
index 0000000..02d4fd2
--- /dev/null
+++ b/fs/exfat/exfat_super.c
@@ -0,0 +1,2617 @@
+/* Some of the source code in this file came from "linux/fs/fat/file.c", "linux/fs/fat/inode.c" and "linux/fs/fat/misc.c". */
+/*
+ *  linux/fs/fat/file.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  regular file handling primitives for fat-based filesystems
+ */
+
+/*
+ *  linux/fs/fat/inode.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner
+ *  Rewritten for the constant inumbers support by Al Viro
+ *
+ *  Fixes:
+ *
+ *    Max Cohan: Fixed invalid FSINFO offset when info_sector is 0
+ */
+
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+#include <linux/smp_lock.h>
+#endif
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+#include <linux/aio.h>
+#endif
+#include <linux/parser.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#include <linux/backing-dev.h>
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <asm/current.h>
+#include <asm/unaligned.h>
+
+#include "exfat_version.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_core.h"
+
+#include "exfat_super.h"
+
+static struct kmem_cache *exfat_inode_cachep;
+
+static int exfat_default_codepage = CONFIG_EXFAT_DEFAULT_CODEPAGE;
+static char exfat_default_iocharset[] = CONFIG_EXFAT_DEFAULT_IOCHARSET;
+
+extern struct timezone sys_tz;
+
+#define CHECK_ERR(x)	BUG_ON(x)
+
+#define UNIX_SECS_1980    315532800L
+
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108    4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA_DECADE    (365 * 10 + 2)
+/* 120 (2100 - 1980) isn't a leap year */
+#define NO_LEAP_YEAR_2100    (120)
+#define IS_LEAP_YEAR(y)    (!((y) & 0x3) && (y) != NO_LEAP_YEAR_2100)
+
+#define SECS_PER_MIN    (60)
+#define SECS_PER_HOUR   (60 * SECS_PER_MIN)
+#define SECS_PER_DAY    (24 * SECS_PER_HOUR)
+
+#define MAKE_LEAP_YEAR(leap_year, year)                         \
+	do {                                                    \
+		if (unlikely(year > NO_LEAP_YEAR_2100))         \
+			leap_year = ((year + 3) / 4) - 1;       \
+		else                                            \
+			leap_year = ((year + 3) / 4);           \
+	} while (0)
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+static time_t accum_days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
+};
+
+static void _exfat_truncate(struct inode *inode, loff_t old_size);
+
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void exfat_time_fat2unix(struct exfat_sb_info *sbi, struct timespec *ts,
+						 DATE_TIME_T *tp)
+{
+	time_t year = tp->Year;
+	time_t ld;
+
+	MAKE_LEAP_YEAR(ld, year);
+
+	if (IS_LEAP_YEAR(year) && (tp->Month) > 2)
+		ld++;
+
+	ts->tv_sec =  tp->Second  + tp->Minute * SECS_PER_MIN
+				  + tp->Hour * SECS_PER_HOUR
+				  + (year * 365 + ld + accum_days_in_year[(tp->Month)] + (tp->Day - 1) + DAYS_DELTA_DECADE) * SECS_PER_DAY
+				  + sys_tz.tz_minuteswest * SECS_PER_MIN;
+	ts->tv_nsec = 0;
+}
+
+/* Convert linear UNIX date to a FAT time/date pair. */
+void exfat_time_unix2fat(struct exfat_sb_info *sbi, struct timespec *ts,
+						 DATE_TIME_T *tp)
+{
+	time_t second = ts->tv_sec;
+	time_t day, month, year;
+	time_t ld;
+
+	second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	/* Jan 1 GMT 00:00:00 1980. But what about other time zones? */
+	if (second < UNIX_SECS_1980) {
+		tp->Second  = 0;
+		tp->Minute  = 0;
+		tp->Hour = 0;
+		tp->Day  = 1;
+		tp->Month  = 1;
+		tp->Year = 0;
+		return;
+	}
+#if (BITS_PER_LONG == 64)
+	if (second >= UNIX_SECS_2108) {
+		tp->Second  = 59;
+		tp->Minute  = 59;
+		tp->Hour = 23;
+		tp->Day  = 31;
+		tp->Month  = 12;
+		tp->Year = 127;
+		return;
+	}
+#endif
+	day = second / SECS_PER_DAY - DAYS_DELTA_DECADE;
+	year = day / 365;
+	MAKE_LEAP_YEAR(ld, year);
+	if (year * 365 + ld > day)
+		year--;
+
+	MAKE_LEAP_YEAR(ld, year);
+	day -= year * 365 + ld;
+
+	if (IS_LEAP_YEAR(year) && day == accum_days_in_year[3]) {
+		month = 2;
+	} else {
+		if (IS_LEAP_YEAR(year) && day > accum_days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (accum_days_in_year[month + 1] > day)
+				break;
+		}
+	}
+	day -= accum_days_in_year[month];
+
+	tp->Second  = second % SECS_PER_MIN;
+	tp->Minute  = (second / SECS_PER_MIN) % 60;
+	tp->Hour = (second / SECS_PER_HOUR) % 24;
+	tp->Day  = day + 1;
+	tp->Month  = month;
+	tp->Year = year;
+}
+
+static struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static int exfat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+#else
+static long exfat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+#endif
+static int exfat_sync_inode(struct inode *inode);
+static struct inode *exfat_build_inode(struct super_block *sb, FILE_ID_T *fid, loff_t i_pos);
+static void exfat_detach(struct inode *inode);
+static void exfat_attach(struct inode *inode, loff_t i_pos);
+static inline unsigned long exfat_hash(loff_t i_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+static int exfat_write_inode(struct inode *inode, int wait);
+#else
+static int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
+#endif
+static void exfat_write_super(struct super_block *sb);
+
+static void __lock_super(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	lock_super(sb);
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	mutex_lock(&sbi->s_lock);
+#endif
+}
+
+static void __unlock_super(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	unlock_super(sb);
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	mutex_unlock(&sbi->s_lock);
+#endif
+}
+
+static int __is_sb_dirty(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	return sb->s_dirt;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	return sbi->s_dirt;
+#endif
+}
+
+static void __set_sb_clean(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	sb->s_dirt = 0;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	sbi->s_dirt = 0;
+#endif
+}
+
+static int __exfat_revalidate(struct dentry *dentry)
+{
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_revalidate(struct dentry *dentry, unsigned int flags)
+#else
+static int exfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+#endif
+
+	if (dentry->d_inode)
+		return 1;
+	return __exfat_revalidate(dentry);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
+#else
+static int exfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+#else
+	unsigned int flags;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+#endif
+
+	flags = nd ? nd->flags : 0;
+#endif
+
+	if (dentry->d_inode)
+		return 1;
+
+	if (!flags)
+		return 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+#else
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+			return 0;
+	}
+#endif
+
+	return __exfat_revalidate(dentry);
+}
+
+static unsigned int __exfat_striptail_len(unsigned int len, const char *name)
+{
+	while (len && name[len - 1] == '.')
+		len--;
+	return len;
+}
+
+static unsigned int exfat_striptail_len(const struct qstr *qstr)
+{
+	return __exfat_striptail_len(qstr->len, qstr->name);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_d_hash(struct dentry *dentry, struct qstr *qstr)
+#else
+static int exfat_d_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
+#endif
+{
+	qstr->hash = full_name_hash(qstr->name, exfat_striptail_len(qstr));
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_d_hashi(const struct dentry *dentry, struct qstr *qstr)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_d_hashi(struct dentry *dentry, struct qstr *qstr)
+#else
+static int exfat_d_hashi(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
+#endif
+{
+	struct super_block *sb = dentry->d_sb;
+	const unsigned char *name;
+	unsigned int len;
+	unsigned long hash;
+
+	name = qstr->name;
+	len = exfat_striptail_len(qstr);
+
+	hash = init_name_hash();
+	while (len--)
+		hash = partial_name_hash(nls_upper(sb, *name++), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_cmpi(struct dentry *parent, struct qstr *a, struct qstr *b)
+#else
+static int exfat_cmpi(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+#endif
+{
+	struct nls_table *t = EXFAT_SB(parent->d_sb)->nls_io;
+	unsigned int alen, blen;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	alen = exfat_striptail_len(a);
+	blen = exfat_striptail_len(b);
+#else
+	alen = exfat_striptail_len(name);
+	blen = __exfat_striptail_len(len, str);
+#endif
+	if (alen == blen) {
+		if (t == NULL) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+			if (strncasecmp(a->name, b->name, alen) == 0)
+#else
+			if (strncasecmp(name->name, str, alen) == 0)
+#endif
+				return 0;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+		} else if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+#else
+		} else if (nls_strnicmp(t, name->name, str, alen) == 0)
+#endif
+			return 0;
+	}
+	return 1;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_cmp(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_cmp(struct dentry *parent, struct qstr *a,
+			struct qstr *b)
+#else
+static int exfat_cmp(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+#endif
+{
+	unsigned int alen, blen;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	alen = exfat_striptail_len(a);
+	blen = exfat_striptail_len(b);
+#else
+	alen = exfat_striptail_len(name);
+	blen = __exfat_striptail_len(len, str);
+#endif
+	if (alen == blen) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+		if (strncmp(a->name, b->name, alen) == 0)
+#else
+		if (strncmp(name->name, str, alen) == 0)
+#endif
+			return 0;
+	}
+	return 1;
+}
+
+static const struct dentry_operations exfat_ci_dentry_ops = {
+	.d_revalidate   = exfat_revalidate_ci,
+	.d_hash         = exfat_d_hashi,
+	.d_compare      = exfat_cmpi,
+};
+
+static const struct dentry_operations exfat_dentry_ops = {
+	.d_revalidate   = exfat_revalidate,
+	.d_hash         = exfat_d_hash,
+	.d_compare      = exfat_cmp,
+};
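The case-insensitive dentry operations above strip trailing dots and then hash and compare the upper-cased name, so "README", "readme" and "readme." all resolve to the same directory entry. Below is a userspace sketch of that idea, illustrative only and not part of the patch: toupper() and an FNV-style hash stand in for the kernel's nls_upper() and partial_name_hash() helpers.

/* Editorial sketch: the idea behind exfat_d_hashi()/exfat_cmpi() -- strip
 * trailing dots, then hash/compare case-insensitively.
 */
#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

static size_t striptail_len(const char *name, size_t len)
{
	while (len && name[len - 1] == '.')
		len--;
	return len;
}

static uint32_t ci_hash(const char *name, size_t len)
{
	uint32_t hash = 2166136261u;   /* FNV-1a, purely for illustration */

	len = striptail_len(name, len);
	while (len--) {
		hash ^= (unsigned char)toupper((unsigned char)*name++);
		hash *= 16777619u;
	}
	return hash;
}

static int ci_equal(const char *a, const char *b)
{
	size_t alen = striptail_len(a, strlen(a));
	size_t blen = striptail_len(b, strlen(b));

	return alen == blen && strncasecmp(a, b, alen) == 0;
}

int main(void)
{
	printf("equal: %d, same bucket: %d\n",
	       ci_equal("README", "readme."),
	       ci_hash("README", 6) == ci_hash("readme.", 7));
	return 0;
}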
+
+/*======================================================================*/
+/*  Directory Entry Operations                                          */
+/*======================================================================*/
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_readdir(struct file *filp, struct dir_context *ctx)
+#else
+static int exfat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)
+	struct inode *inode = file_inode(filp);
+#else
+	struct inode *inode = filp->f_path.dentry->d_inode;
+#endif
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	DIR_ENTRY_T de;
+	unsigned long inum;
+	loff_t cpos;
+	int err = 0;
+
+	__lock_super(sb);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	cpos = ctx->pos;
+#else
+	cpos = filp->f_pos;
+#endif
+	/* Fake . and .. for the root directory, and for every directory on
+	 * exFAT (the on-disk format has no dot entries).
+	 */
+	if ((p_fs->vol_type == EXFAT) || (inode->i_ino == EXFAT_ROOT_INO)) {
+		while (cpos < 2) {
+			if (inode->i_ino == EXFAT_ROOT_INO)
+				inum = EXFAT_ROOT_INO;
+			else if (cpos == 0)
+				inum = inode->i_ino;
+			else /* (cpos == 1) */
+				inum = parent_ino(filp->f_path.dentry);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+			if (!dir_emit_dots(filp, ctx))
+#else
+			if (filldir(dirent, "..", cpos+1, cpos, inum, DT_DIR) < 0)
+#endif
+				goto out;
+			cpos++;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+			ctx->pos++;
+#else
+			filp->f_pos++;
+#endif
+		}
+		if (cpos == 2)
+			cpos = 0;
+	}
+	if (cpos & (DENTRY_SIZE - 1)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+get_new:
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+	EXFAT_I(inode)->fid.rwoffset = cpos >> DENTRY_SIZE_BITS;
+
+	err = FsReadDir(inode, &de);
+	if (err) {
+		/* At least we tried to read a sector.
+		 * Move cpos to the next sector position (should be aligned).
+		 */
+		if (err == FFS_MEDIAERR) {
+			cpos += 1 << p_bd->sector_size_bits;
+			cpos &= ~((1 << p_bd->sector_size_bits)-1);
+		}
+
+		err = -EIO;
+		goto end_of_dir;
+	}
+
+	cpos = EXFAT_I(inode)->fid.rwoffset << DENTRY_SIZE_BITS;
+
+	if (!de.Name[0])
+		goto end_of_dir;
+
+	if (!memcmp(de.ShortName, DOS_CUR_DIR_NAME, DOS_NAME_LENGTH)) {
+		inum = inode->i_ino;
+	} else if (!memcmp(de.ShortName, DOS_PAR_DIR_NAME, DOS_NAME_LENGTH)) {
+		inum = parent_ino(filp->f_path.dentry);
+	} else {
+		loff_t i_pos = ((loff_t) EXFAT_I(inode)->fid.start_clu << 32) |
+					   ((EXFAT_I(inode)->fid.rwoffset-1) & 0xffffffff);
+
+		struct inode *tmp = exfat_iget(sb, i_pos);
+		if (tmp) {
+			inum = tmp->i_ino;
+			iput(tmp);
+		} else {
+			inum = iunique(sb, EXFAT_ROOT_INO);
+		}
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	if (!dir_emit(ctx, de.Name, strlen(de.Name), inum,
+		(de.Attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
+#else
+	if (filldir(dirent, de.Name, strlen(de.Name), cpos-1, inum,
+				(de.Attr & ATTR_SUBDIR) ? DT_DIR : DT_REG) < 0)
+#endif
+		goto out;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	ctx->pos = cpos;
+#else
+	filp->f_pos = cpos;
+#endif
+	goto get_new;
+
+end_of_dir:
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	ctx->pos = cpos;
+#else
+	filp->f_pos = cpos;
+#endif
+out:
+	__unlock_super(sb);
+	return err;
+}
+
+static int exfat_ioctl_volume_id(struct inode *dir)
+{
+	struct super_block *sb = dir->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+
+	return p_fs->vol_id;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static int exfat_generic_ioctl(struct inode *inode, struct file *filp,
+							   unsigned int cmd, unsigned long arg)
+#else
+static long exfat_generic_ioctl(struct file *filp,
+								unsigned int cmd, unsigned long arg)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,3)
+	struct inode *inode = filp->f_path.dentry->d_inode;
+#else
+	struct inode *inode = filp->f_dentry->d_inode;
+#endif
+#endif
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	unsigned int flags;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	switch (cmd) {
+	case EXFAT_IOCTL_GET_VOLUME_ID:
+		return exfat_ioctl_volume_id(inode);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	case EXFAT_IOC_GET_DEBUGFLAGS: {
+		struct super_block *sb = inode->i_sb;
+		struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+		flags = sbi->debug_flags;
+		return put_user(flags, (int __user *)arg);
+	}
+	case EXFAT_IOC_SET_DEBUGFLAGS: {
+		struct super_block *sb = inode->i_sb;
+		struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		__lock_super(sb);
+		sbi->debug_flags = flags;
+		__unlock_super(sb);
+
+		return 0;
+	}
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+	default:
+		return -ENOTTY; /* Inappropriate ioctl for device */
+	}
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
+static int exfat_file_fsync(struct file *filp, struct dentry *dentry,
+				int datasync)
+#else
+static int exfat_file_fsync(struct file *filp, int datasync)
+#endif
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	int res, err;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
+	res = simple_fsync(filp, dentry, datasync);
+#else
+	res = generic_file_fsync(filp, datasync);
+#endif
+	err = FsSyncVol(sb, 1);
+
+	return res ? res : err;
+}
+#endif
+
+const struct file_operations exfat_dir_operations = {
+	.llseek     = generic_file_llseek,
+	.read       = generic_read_dir,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	.iterate    = exfat_readdir,
+#else
+	.readdir    = exfat_readdir,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.ioctl      = exfat_generic_ioctl,
+	.fsync      = exfat_file_fsync,
+#else
+	.unlocked_ioctl = exfat_generic_ioctl,
+	.fsync      = generic_file_fsync,
+#endif
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						bool excl)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						struct nameidata *nd)
+#else
+static int exfat_create(struct inode *dir, struct dentry *dentry, int mode,
+						struct nameidata *nd)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_create entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateFile(dir, (u8 *) dentry->d_name.name, FM_REGULAR, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else if (err == FFS_NAMETOOLONG)
+			err = -ENAMETOOLONG;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unnecessary. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_create exited\n");
+	return err;
+}
+
+static int exfat_find(struct inode *dir, struct qstr *qname,
+					  FILE_ID_T *fid)
+{
+	int err;
+
+	if (qname->len == 0)
+		return -ENOENT;
+
+	err = FsLookupFile(dir, (u8 *) qname->name, fid);
+	if (err)
+		return -ENOENT;
+
+	return 0;
+}
+
+static int exfat_d_anon_disconn(struct dentry *dentry)
+{
+	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
+						   unsigned int flags)
+#else
+static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
+						   struct nameidata *nd)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct dentry *alias;
+	int err;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	u64 ret;
+	mode_t i_mode;
+
+	__lock_super(sb);
+	DPRINTK("exfat_lookup entered\n");
+	err = exfat_find(dir, &dentry->d_name, &fid);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto error;
+	}
+
+	i_mode = inode->i_mode;
+	if (S_ISLNK(i_mode)) {
+		EXFAT_I(inode)->target = kmalloc(i_size_read(inode)+1, GFP_KERNEL);
+		if (!EXFAT_I(inode)->target) {
+			err = -ENOMEM;
+			goto error;
+		}
+		FsReadFile(dir, &fid, EXFAT_I(inode)->target, i_size_read(inode), &ret);
+		*(EXFAT_I(inode)->target + i_size_read(inode)) = '\0';
+	}
+
+	alias = d_find_alias(inode);
+	if (alias && !exfat_d_anon_disconn(alias)) {
+		CHECK_ERR(d_unhashed(alias));
+		if (!S_ISDIR(i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		__unlock_super(sb);
+		DPRINTK("exfat_lookup exited 1\n");
+		return alias;
+	} else {
+		dput(alias);
+	}
+out:
+	__unlock_super(sb);
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	dentry->d_op = sb->s_root->d_op;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry) {
+		dentry->d_op = sb->s_root->d_op;
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+	}
+#else
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry)
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+#endif
+	DPRINTK("exfat_lookup exited 2\n");
+	return dentry;
+
+error:
+	__unlock_super(sb);
+	DPRINTK("exfat_lookup exited 3\n");
+	return ERR_PTR(err);
+}
+
+static int exfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct timespec ts;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_unlink entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsRemoveFile(dir, &(EXFAT_I(inode)->fid));
+	if (err) {
+		if (err == FFS_PERMISSIONERR)
+			err = -EPERM;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = ts;
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_unlink exited\n");
+	return err;
+}
+
+static int exfat_symlink(struct inode *dir, struct dentry *dentry, const char *target)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+	u64 len = (u64) strlen(target);
+	u64 ret;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_symlink entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateFile(dir, (u8 *) dentry->d_name.name, FM_SYMLINK, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+
+	err = FsWriteFile(dir, &fid, (char *) target, len, &ret);
+
+	if (err) {
+		FsRemoveFile(dir, &fid);
+
+		if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	EXFAT_I(inode)->target = kmalloc(len+1, GFP_KERNEL);
+	if (!EXFAT_I(inode)->target) {
+		err = -ENOMEM;
+		goto out;
+	}
+	memcpy(EXFAT_I(inode)->target, target, len+1);
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_symlink exited\n");
+	return err;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+#else
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_mkdir entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateDir(dir, (u8 *) dentry->d_name.name, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else if (err == FFS_NAMETOOLONG)
+			err = -ENAMETOOLONG;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+	inc_nlink(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_mkdir exited\n");
+	return err;
+}
+
+static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct timespec ts;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_rmdir entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsRemoveDir(dir, &(EXFAT_I(inode)->fid));
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -ENOTEMPTY;
+		else if (err == FFS_NOTFOUND)
+			err = -ENOENT;
+		else if (err == FFS_DIRBUSY)
+			err = -EBUSY;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = ts;
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_rmdir exited\n");
+	return err;
+}
+
+static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+						struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode, *new_inode;
+	struct super_block *sb = old_dir->i_sb;
+	struct timespec ts;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_rename entered\n");
+
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(old_inode)->fid.size = i_size_read(old_inode);
+
+	err = FsMoveFile(old_dir, &(EXFAT_I(old_inode)->fid), new_dir, new_dentry);
+	if (err) {
+		if (err == FFS_PERMISSIONERR)
+			err = -EPERM;
+		else if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_NOTFOUND)
+			err = -ENOENT;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+	new_dir->i_version++;
+	new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = ts;
+	if (IS_DIRSYNC(new_dir))
+		(void) exfat_sync_inode(new_dir);
+	else
+		mark_inode_dirty(new_dir);
+
+	i_pos = ((loff_t) EXFAT_I(old_inode)->fid.dir.dir << 32) |
+			(EXFAT_I(old_inode)->fid.entry & 0xffffffff);
+
+	exfat_detach(old_inode);
+	exfat_attach(old_inode, i_pos);
+	if (IS_DIRSYNC(new_dir))
+		(void) exfat_sync_inode(old_inode);
+	else
+		mark_inode_dirty(old_inode);
+
+	if ((S_ISDIR(old_inode->i_mode)) && (old_dir != new_dir)) {
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void) exfat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		exfat_detach(new_inode);
+		drop_nlink(new_inode);
+		if (S_ISDIR(new_inode->i_mode))
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_rename exited\n");
+	return err;
+}
+
+static int exfat_cont_expand(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t start = i_size_read(inode), count = size - i_size_read(inode);
+	int err, err2;
+
+	err = generic_cont_expand_simple(inode, size);
+	if (err != 0)
+		return err;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+	if (IS_SYNC(inode)) {
+		err = filemap_fdatawrite_range(mapping, start, start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		err = (err) ? (err) : (err2);
+		err2 = write_inode_now(inode, 1);
+		err = (err) ? (err) : (err2);
+		if (!err)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
+			err =  wait_on_page_writeback_range(mapping,
+					start >> PAGE_CACHE_SHIFT,
+					(start + count - 1) >> PAGE_CACHE_SHIFT);
+#else
+			err =  filemap_fdatawait_range(mapping, start, start + count - 1);
+#endif
+	}
+	return err;
+}
+
+static int exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
+{
+	mode_t allow_utime = sbi->options.allow_utime;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	if (!uid_eq(current_fsuid(), inode->i_uid))
+#else
+	if (current_fsuid() != inode->i_uid)
+#endif
+	{
+		if (in_group_p(inode->i_gid))
+			allow_utime >>= 3;
+		if (allow_utime & MAY_WRITE)
+			return 1;
+	}
+
+	/* use a default check */
+	return 0;
+}
+
+static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
+							   struct inode *inode, umode_t *mode_ptr)
+{
+	mode_t i_mode, mask, perm;
+
+	i_mode = inode->i_mode;
+
+	if (S_ISREG(i_mode) || S_ISLNK(i_mode))
+		mask = sbi->options.fs_fmask;
+	else
+		mask = sbi->options.fs_dmask;
+
+	perm = *mode_ptr & ~(S_IFMT | mask);
+
+	/* Of the r and x bits, all (subject to umask) must be present. */
+	if ((perm & (S_IRUGO | S_IXUGO)) != (i_mode & (S_IRUGO|S_IXUGO)))
+		return -EPERM;
+
+	if (exfat_mode_can_hold_ro(inode)) {
+		/* Of the w bits, either all (subject to umask) or none must be present. */
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		/* If exfat_mode_can_hold_ro(inode) is false, can't change w bits. */
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
+
+	*mode_ptr &= S_IFMT | perm;
+
+	return 0;
+}
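exfat_sanitize_mode() only admits mode changes that the fmask/dmask scheme can actually store: the r and x bits must stay as the mount presents them, and the w bits may only be dropped entirely or set to exactly the umask-permitted set. The following is a sketch of that filtering with an assumed fmask of 022 and a file presented as 0644, illustrative only and not part of the patch; it models the common case where the read-only attribute can be stored.

/* Editorial sketch: which chmod requests exfat_sanitize_mode() would keep. */
#include <stdio.h>
#include <sys/stat.h>

#define FS_FMASK 022                    /* assumed mount-time fmask */

#ifndef S_IRUGO
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)   /* 0444 */
#define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH)   /* 0222 */
#define S_IXUGO (S_IXUSR | S_IXGRP | S_IXOTH)   /* 0111 */
#endif

static int sanitize(mode_t i_mode, mode_t want, mode_t *out)
{
	mode_t perm = want & ~(S_IFMT | FS_FMASK);

	/* r and x bits must stay exactly as the mount presents them */
	if ((perm & (S_IRUGO | S_IXUGO)) != (i_mode & (S_IRUGO | S_IXUGO)))
		return -1;
	/* w bits: either none, or exactly the umask-permitted set */
	if ((perm & S_IWUGO) && (perm & S_IWUGO) != (S_IWUGO & ~FS_FMASK))
		return -1;
	*out = (want & S_IFMT) | perm;
	return 0;
}

int main(void)
{
	mode_t tries[] = { 0444, 0666, 0640 };
	mode_t out;

	for (unsigned int i = 0; i < sizeof(tries) / sizeof(tries[0]); i++) {
		if (sanitize(0644, tries[i], &out) == 0)
			printf("chmod %04o -> stored as %04o\n",
			       (unsigned int)tries[i], (unsigned int)out);
		else
			printf("chmod %04o -> EPERM\n", (unsigned int)tries[i]);
	}
	return 0;
}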
+
+static int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+{
+
+	struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
+	struct inode *inode = dentry->d_inode;
+	unsigned int ia_valid;
+	int error;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,35)
+	loff_t old_size;
+#endif
+
+	DPRINTK("exfat_setattr entered\n");
+
+	if ((attr->ia_valid & ATTR_SIZE)
+		&& (attr->ia_size > i_size_read(inode))) {
+		error = exfat_cont_expand(inode, attr->ia_size);
+		if (error || attr->ia_valid == ATTR_SIZE)
+			return error;
+		attr->ia_valid &= ~ATTR_SIZE;
+	}
+
+	ia_valid = attr->ia_valid;
+
+	if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET))
+		&& exfat_allow_set_time(sbi, inode)) {
+		attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET);
+	}
+
+	error = inode_change_ok(inode, attr);
+	attr->ia_valid = ia_valid;
+	if (error)
+		return error;
+
+	if (((attr->ia_valid & ATTR_UID) &&
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+		 (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+		((attr->ia_valid & ATTR_GID) &&
+		 (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+#else
+		 (attr->ia_uid != sbi->options.fs_uid)) ||
+		((attr->ia_valid & ATTR_GID) &&
+		 (attr->ia_gid != sbi->options.fs_gid)) ||
+#endif
+		((attr->ia_valid & ATTR_MODE) &&
+		 (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | S_IRWXUGO)))) {
+		return -EPERM;
+	}
+
+	/*
+	 * We don't return -EPERM here. It looks strange, but this behavior
+	 * is too old to change.
+	 */
+	if (attr->ia_valid & ATTR_MODE) {
+		if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+			attr->ia_valid &= ~ATTR_MODE;
+	}
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	if (attr->ia_valid)
+		error = inode_setattr(inode, attr);
+#else
+	if (attr->ia_valid & ATTR_SIZE) {
+		old_size = i_size_read(inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+		down_write(&EXFAT_I(inode)->truncate_lock);
+		truncate_setsize(inode, attr->ia_size);
+		_exfat_truncate(inode, old_size);
+		up_write(&EXFAT_I(inode)->truncate_lock);
+#else
+		truncate_setsize(inode, attr->ia_size);
+		_exfat_truncate(inode, old_size);
+#endif
+	}
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+#endif
+
+	DPRINTK("exfat_setattr exited\n");
+	return error;
+}
+
+static int exfat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+
+	DPRINTK("exfat_getattr entered\n");
+
+	generic_fillattr(inode, stat);
+	stat->blksize = EXFAT_SB(inode->i_sb)->fs_info.cluster_size;
+
+	DPRINTK("exfat_getattr exited\n");
+	return 0;
+}
+
+const struct inode_operations exfat_dir_inode_operations = {
+	.create        = exfat_create,
+	.lookup        = exfat_lookup,
+	.unlink        = exfat_unlink,
+	.symlink       = exfat_symlink,
+	.mkdir         = exfat_mkdir,
+	.rmdir         = exfat_rmdir,
+	.rename        = exfat_rename,
+	.setattr       = exfat_setattr,
+	.getattr       = exfat_getattr,
+};
+
+/*======================================================================*/
+/*  File Operations                                                     */
+/*======================================================================*/
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
+static const char *exfat_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
+{
+	struct exfat_inode_info *ei = EXFAT_I(inode);
+
+	if (ei->target != NULL)
+		return (char *)(ei->target);
+	return NULL;
+}
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(4,1,0)
+static const char *exfat_follow_link(struct dentry *dentry, void **cookie)
+{
+	struct exfat_inode_info *ei = EXFAT_I(dentry->d_inode);
+	return *cookie = (char *)(ei->target);
+}
+#else
+static void *exfat_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exfat_inode_info *ei = EXFAT_I(dentry->d_inode);
+	nd_set_link(nd, (char *)(ei->target));
+	return NULL;
+}
+#endif
+
+const struct inode_operations exfat_symlink_inode_operations = {
+	.readlink    = generic_readlink,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
+	.get_link    = exfat_get_link,
+#else
+	.follow_link = exfat_follow_link,
+#endif
+};
+
+static int exfat_file_release(struct inode *inode, struct file *filp)
+{
+	struct super_block *sb = inode->i_sb;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+	FsSyncVol(sb, 0);
+	return 0;
+}
+
+const struct file_operations exfat_file_operations = {
+	.llseek      = generic_file_llseek,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+	.read        = do_sync_read,
+	.write       = do_sync_write,
+	.aio_read    = generic_file_aio_read,
+	.aio_write   = generic_file_aio_write,
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
+	.read        = new_sync_read,
+	.write       = new_sync_write,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	.read_iter   = generic_file_read_iter,
+	.write_iter  = generic_file_write_iter,
+#endif
+	.mmap        = generic_file_mmap,
+	.release     = exfat_file_release,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.ioctl       = exfat_generic_ioctl,
+	.fsync       = exfat_file_fsync,
+#else
+	.unlocked_ioctl  = exfat_generic_ioctl,
+	.fsync       = generic_file_fsync,
+#endif
+	.splice_read = generic_file_splice_read,
+};
+
+static void _exfat_truncate(struct inode *inode, loff_t old_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	int err;
+
+	__lock_super(sb);
+
+	/*
+	 * This protects against shrinking a file and then writing into
+	 * the hole past its new end.
+	 */
+	if (EXFAT_I(inode)->mmu_private > i_size_read(inode))
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+
+	if (EXFAT_I(inode)->fid.start_clu == 0)
+		goto out;
+
+	err = FsTruncateFile(inode, old_size, i_size_read(inode));
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	if (IS_DIRSYNC(inode))
+		(void) exfat_sync_inode(inode);
+	else
+		mark_inode_dirty(inode);
+
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
+out:
+	__unlock_super(sb);
+}
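The i_blocks computation above rounds the file size up to a whole number of clusters and expresses the result in 512-byte blocks. A worked example with an assumed 32 KiB cluster, illustrative only and not part of the patch:

/* Editorial sketch: the cluster-rounding used for i_blocks. */
#include <stdio.h>

int main(void)
{
	unsigned long long cluster_size = 32768;   /* assumed p_fs->cluster_size */
	unsigned long long i_size = 100000;        /* example file size in bytes */
	unsigned long long blocks;

	blocks = ((i_size + cluster_size - 1) & ~(cluster_size - 1)) >> 9;
	printf("%llu bytes -> %llu 512-byte blocks (4 clusters)\n", i_size, blocks);
	return 0;
}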
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,36)
+static void exfat_truncate(struct inode *inode)
+{
+	_exfat_truncate(inode, i_size_read(inode));
+}
+#endif
+
+const struct inode_operations exfat_file_inode_operations = {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,36)
+	.truncate    = exfat_truncate,
+#endif
+	.setattr     = exfat_setattr,
+	.getattr     = exfat_getattr,
+};
+
+/*======================================================================*/
+/*  Address Space Operations                                            */
+/*======================================================================*/
+
+static int exfat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+					  unsigned long *mapped_blocks, int *create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	BD_INFO_T *p_bd = &(sbi->bd_info);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
+	sector_t last_block;
+	int err, clu_offset, sec_offset;
+	unsigned int cluster;
+
+	*phys = 0;
+	*mapped_blocks = 0;
+
+	if ((p_fs->vol_type == FAT12) || (p_fs->vol_type == FAT16)) {
+		if (inode->i_ino == EXFAT_ROOT_INO) {
+			if (sector < (p_fs->dentries_in_root >> (p_bd->sector_size_bits-DENTRY_SIZE_BITS))) {
+				*phys = sector + p_fs->root_start_sector;
+				*mapped_blocks = 1;
+			}
+			return 0;
+		}
+	}
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (*create == 0)
+			return 0;
+	} else {
+		*create = 0;
+	}
+
+	clu_offset = sector >> p_fs->sectors_per_clu_bits;  /* cluster offset */
+	sec_offset = sector & (p_fs->sectors_per_clu - 1);  /* sector offset in cluster */
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsMapCluster(inode, clu_offset, &cluster);
+
+	if (err) {
+		if (err == FFS_FULL)
+			return -ENOSPC;
+		else
+			return -EIO;
+	} else if (cluster != CLUSTER_32(~0)) {
+		*phys = START_SECTOR(cluster) + sec_offset;
+		*mapped_blocks = p_fs->sectors_per_clu - sec_offset;
+	}
+
+	return 0;
+}
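exfat_bmap() splits a file-relative sector into a cluster index (sector >> sectors_per_clu_bits) and an offset inside that cluster, asks FsMapCluster() for the physical cluster, and reports how many sectors remain contiguous from there. Below is a numeric sketch with assumed geometry (512-byte sectors, 64 sectors per cluster, data area starting at sector 4096), illustrative only and not part of the patch; START_SECTOR() here is a stand-in for the driver's macro.

/* Editorial sketch: the sector-to-cluster split done by exfat_bmap(). */
#include <stdio.h>

#define SECTORS_PER_CLU_BITS  6
#define SECTORS_PER_CLU       (1u << SECTORS_PER_CLU_BITS)
#define DATA_START_SECTOR     4096u
/* clusters are numbered from 2, as in FAT/exFAT */
#define START_SECTOR(clu)     (DATA_START_SECTOR + ((clu) - 2) * SECTORS_PER_CLU)

int main(void)
{
	unsigned long sector = 200;                                   /* file-relative sector */
	unsigned long clu_offset = sector >> SECTORS_PER_CLU_BITS;    /* = 3 */
	unsigned long sec_offset = sector & (SECTORS_PER_CLU - 1);    /* = 8 */
	unsigned int cluster = 42;              /* pretend FsMapCluster() returned this */

	printf("cluster index in file: %lu, offset in cluster: %lu\n",
	       clu_offset, sec_offset);
	printf("physical sector: %lu, contiguously mapped blocks: %lu\n",
	       START_SECTOR(cluster) + sec_offset,
	       (unsigned long)SECTORS_PER_CLU - sec_offset);
	return 0;
}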
+
+static int exfat_get_block(struct inode *inode, sector_t iblock,
+						   struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int err;
+	unsigned long mapped_blocks;
+	sector_t phys;
+
+	__lock_super(sb);
+
+	err = exfat_bmap(inode, iblock, &phys, &mapped_blocks, &create);
+	if (err) {
+		__unlock_super(sb);
+		return err;
+	}
+
+	if (phys) {
+		max_blocks = min(mapped_blocks, max_blocks);
+		if (create) {
+			EXFAT_I(inode)->mmu_private += max_blocks << sb->s_blocksize_bits;
+			set_buffer_new(bh_result);
+		}
+		map_bh(bh_result, sb, phys);
+	}
+
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+	__unlock_super(sb);
+
+	return 0;
+}
+
+static int exfat_readpage(struct file *file, struct page *page)
+{
+	int ret;
+	ret =  mpage_readpage(page, exfat_get_block);
+	return ret;
+}
+
+static int exfat_readpages(struct file *file, struct address_space *mapping,
+				   struct list_head *pages, unsigned nr_pages)
+{
+	int ret;
+	ret =  mpage_readpages(mapping, pages, nr_pages, exfat_get_block);
+	return ret;
+}
+
+static int exfat_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+	ret = block_write_full_page(page, exfat_get_block, wbc);
+	return ret;
+}
+
+static int exfat_writepages(struct address_space *mapping,
+						struct writeback_control *wbc)
+{
+	int ret;
+	ret = mpage_writepages(mapping, wbc, exfat_get_block);
+	return ret;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+static void exfat_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+	if (to > i_size_read(inode)) {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0)
+		truncate_pagecache(inode, i_size_read(inode));
+#else
+		truncate_pagecache(inode, to, i_size_read(inode));
+#endif
+		EXFAT_I(inode)->fid.size = i_size_read(inode);
+		_exfat_truncate(inode, i_size_read(inode));
+	}
+}
+#endif
+
+static int exfat_write_begin(struct file *file, struct address_space *mapping,
+				 loff_t pos, unsigned len, unsigned flags,
+					 struct page **pagep, void **fsdata)
+{
+	int ret;
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				   exfat_get_block,
+				   &EXFAT_I(mapping->host)->mmu_private);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if (ret < 0)
+		exfat_write_failed(mapping, pos+len);
+#endif
+	return ret;
+}
+
+static int exfat_write_end(struct file *file, struct address_space *mapping,
+				   loff_t pos, unsigned len, unsigned copied,
+					   struct page *pagep, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	int err;
+
+	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if (err < len)
+		exfat_write_failed(mapping, pos+len);
+#endif
+
+	if (!(err < 0) && !(fid->attr & ATTR_ARCHIVE)) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		fid->attr |= ATTR_ARCHIVE;
+		mark_inode_dirty(inode);
+	}
+	return err;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+						struct iov_iter *iter, loff_t offset)
+#else
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+					   const struct iovec *iov,
+					   loff_t offset, unsigned long nr_segs)
+#endif
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+					   struct iov_iter *iter, loff_t offset)
+#else /* >= 4.2.x */
+static ssize_t exfat_direct_IO(struct kiocb *iocb,
+					   struct iov_iter *iter, loff_t offset)
+#endif
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+#endif
+	ssize_t ret;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0)
+	int rw;
+
+	rw = iov_iter_rw(iter);
+#endif
+
+	if (rw == WRITE) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+		if (EXFAT_I(inode)->mmu_private <
+					(offset + iov_iter_count(iter)))
+#else
+		if (EXFAT_I(inode)->mmu_private < (offset + iov_length(iov, nr_segs)))
+#endif
+#else
+		if (EXFAT_I(inode)->mmu_private < (offset + iov_iter_count(iter)))
+#endif
+			return 0;
+	}
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0)
+	ret = blockdev_direct_IO(iocb, inode, iter,
+					offset, exfat_get_block);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	ret = blockdev_direct_IO(rw, iocb, inode, iter,
+					offset, exfat_get_block);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+	ret = blockdev_direct_IO(rw, iocb, inode, iter,
+					offset, exfat_get_block);
+#else
+	ret = blockdev_direct_IO(rw, iocb, inode, iov,
+					offset, nr_segs, exfat_get_block);
+#endif
+#else
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+					offset, nr_segs, exfat_get_block, NULL);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	if ((ret < 0) && (rw & WRITE))
+		exfat_write_failed(mapping, offset+iov_iter_count(iter));
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if ((ret < 0) && (rw & WRITE))
+#ifdef CONFIG_AIO_OPTIMIZATION
+		exfat_write_failed(mapping, offset+iov_iter_count(iter));
+#else
+		exfat_write_failed(mapping, offset+iov_length(iov, nr_segs));
+#endif
+#endif
+	return ret;
+}
+
+static sector_t _exfat_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t blocknr;
+
+	/* exfat_get_cluster() assumes the requested blocknr isn't truncated. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	down_read(&EXFAT_I(mapping->host)->truncate_lock);
+	blocknr = generic_block_bmap(mapping, block, exfat_get_block);
+	up_read(&EXFAT_I(mapping->host)->truncate_lock);
+#else
+	down_read(&EXFAT_I(mapping->host)->i_alloc_sem);
+	blocknr = generic_block_bmap(mapping, block, exfat_get_block);
+	up_read(&EXFAT_I(mapping->host)->i_alloc_sem);
+#endif
+
+	return blocknr;
+}
+
+const struct address_space_operations exfat_aops = {
+	.readpage    = exfat_readpage,
+	.readpages   = exfat_readpages,
+	.writepage   = exfat_writepage,
+	.writepages  = exfat_writepages,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+	.sync_page   = block_sync_page,
+#endif
+	.write_begin = exfat_write_begin,
+	.write_end   = exfat_write_end,
+	.direct_IO   = exfat_direct_IO,
+	.bmap        = _exfat_bmap
+};
+
+/*======================================================================*/
+/*  Super Operations                                                    */
+/*======================================================================*/
+
+static inline unsigned long exfat_hash(loff_t i_pos)
+{
+	return hash_32(i_pos, EXFAT_HASH_BITS);
+}
+
+static struct inode *exfat_iget(struct super_block *sb, loff_t i_pos)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct exfat_inode_info *info;
+	struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+	struct inode *inode = NULL;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
+	struct hlist_node *node;
+
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_for_each_entry(info, node, head, i_hash_fat) {
+#else
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_for_each_entry(info, head, i_hash_fat) {
+#endif
+		CHECK_ERR(info->vfs_inode.i_sb != sb);
+
+		if (i_pos != info->i_pos)
+			continue;
+		inode = igrab(&info->vfs_inode);
+		if (inode)
+			break;
+	}
+	spin_unlock(&sbi->inode_hash_lock);
+	return inode;
+}
+
+static void exfat_attach(struct inode *inode, loff_t i_pos)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+	struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+
+	spin_lock(&sbi->inode_hash_lock);
+	EXFAT_I(inode)->i_pos = i_pos;
+	hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head);
+	spin_unlock(&sbi->inode_hash_lock);
+}
+
+static void exfat_detach(struct inode *inode)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_del_init(&EXFAT_I(inode)->i_hash_fat);
+	EXFAT_I(inode)->i_pos = 0;
+	spin_unlock(&sbi->inode_hash_lock);
+}
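The inode hash table used by exfat_attach()/exfat_iget() is keyed by i_pos, with the directory (cluster) in the high 32 bits and the entry index in the low 32 bits; exfat_hash() folds that into a bucket with hash_32(). A userspace sketch of the key and bucket computation, illustrative only and not part of the patch; EXFAT_HASH_BITS is an assumed value and the multiplier merely mimics hash_32().

/* Editorial sketch: the i_pos key and its hash bucket. */
#include <stdint.h>
#include <stdio.h>

#define EXFAT_HASH_BITS  8   /* assumed bucket-count order, for illustration */

static uint64_t make_i_pos(uint32_t dir_clu, uint32_t entry)
{
	return ((uint64_t)dir_clu << 32) | entry;
}

static unsigned int i_pos_bucket(uint64_t i_pos)
{
	/* hash_32()-style: multiply the low word and keep the top bits */
	return ((uint32_t)i_pos * 0x61C88647u) >> (32 - EXFAT_HASH_BITS);
}

int main(void)
{
	uint64_t i_pos = make_i_pos(42, 7);   /* entry 7 of the directory at cluster 42 */

	printf("i_pos=%#llx dir=%u entry=%u bucket=%u\n",
	       (unsigned long long)i_pos,
	       (uint32_t)(i_pos >> 32), (uint32_t)i_pos,
	       i_pos_bucket(i_pos));
	return 0;
}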
+
+/* doesn't deal with root inode */
+static int exfat_fill_inode(struct inode *inode, FILE_ID_T *fid)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	DIR_ENTRY_T info;
+
+	memcpy(&(EXFAT_I(inode)->fid), fid, sizeof(FILE_ID_T));
+
+	FsReadStat(inode, &info);
+
+	EXFAT_I(inode)->i_pos = 0;
+	EXFAT_I(inode)->target = NULL;
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = get_seconds();
+
+	if (info.Attr & ATTR_SUBDIR) { /* directory */
+		inode->i_generation &= ~1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_dir_inode_operations;
+		inode->i_fop = &exfat_dir_operations;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,00)
+		set_nlink(inode, info.NumSubdirs);
+#else
+		inode->i_nlink = info.NumSubdirs;
+#endif
+	} else if (info.Attr & ATTR_SYMLINK) { /* symbolic link */
+		inode->i_generation |= 1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_symlink_inode_operations;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+	} else { /* regular file */
+		inode->i_generation |= 1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_file_inode_operations;
+		inode->i_fop = &exfat_file_operations;
+		inode->i_mapping->a_ops = &exfat_aops;
+		inode->i_mapping->nrpages = 0;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+	}
+	exfat_save_attr(inode, info.Attr);
+
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
+
+	exfat_time_fat2unix(sbi, &inode->i_mtime, &info.ModifyTimestamp);
+	exfat_time_fat2unix(sbi, &inode->i_ctime, &info.CreateTimestamp);
+	exfat_time_fat2unix(sbi, &inode->i_atime, &info.AccessTimestamp);
+
+	return 0;
+}
+
+static struct inode *exfat_build_inode(struct super_block *sb,
+									   FILE_ID_T *fid, loff_t i_pos) {
+	struct inode *inode;
+	int err;
+
+	inode = exfat_iget(sb, i_pos);
+	if (inode)
+		goto out;
+	inode = new_inode(sb);
+	if (!inode) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	inode->i_ino = iunique(sb, EXFAT_ROOT_INO);
+	inode->i_version = 1;
+	err = exfat_fill_inode(inode, fid);
+	if (err) {
+		iput(inode);
+		inode = ERR_PTR(err);
+		goto out;
+	}
+	exfat_attach(inode, i_pos);
+	insert_inode_hash(inode);
+out:
+	return inode;
+}
+
+static int exfat_sync_inode(struct inode *inode)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+	return exfat_write_inode(inode, 0);
+#else
+	return exfat_write_inode(inode, NULL);
+#endif
+}
+
+static struct inode *exfat_alloc_inode(struct super_block *sb)
+{
+	struct exfat_inode_info *ei;
+
+	ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	init_rwsem(&ei->truncate_lock);
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static void exfat_destroy_inode(struct inode *inode)
+{
+	if (EXFAT_I(inode)->target)
+		kfree(EXFAT_I(inode)->target);
+	EXFAT_I(inode)->target = NULL;
+
+	kmem_cache_free(exfat_inode_cachep, EXFAT_I(inode));
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+static int exfat_write_inode(struct inode *inode, int wait)
+#else
+static int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
+#endif
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	DIR_ENTRY_T info;
+
+	if (inode->i_ino == EXFAT_ROOT_INO)
+		return 0;
+
+	info.Attr = exfat_make_attr(inode);
+	info.Size = i_size_read(inode);
+
+	exfat_time_unix2fat(sbi, &inode->i_mtime, &info.ModifyTimestamp);
+	exfat_time_unix2fat(sbi, &inode->i_ctime, &info.CreateTimestamp);
+	exfat_time_unix2fat(sbi, &inode->i_atime, &info.AccessTimestamp);
+
+	FsWriteStat(inode, &info);
+
+	return 0;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static void exfat_delete_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+static void exfat_clear_inode(struct inode *inode)
+{
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+}
+#else
+static void exfat_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (!inode->i_nlink)
+		i_size_write(inode, 0);
+	invalidate_inode_buffers(inode);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
+	end_writeback(inode);
+#else
+	clear_inode(inode);
+#endif
+	exfat_detach(inode);
+
+	remove_inode_hash(inode);
+}
+#endif
+
+static void exfat_free_super(struct exfat_sb_info *sbi)
+{
+	if (sbi->nls_disk)
+		unload_nls(sbi->nls_disk);
+	if (sbi->nls_io)
+		unload_nls(sbi->nls_io);
+	if (sbi->options.iocharset != exfat_default_iocharset)
+		kfree(sbi->options.iocharset);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
+	/* mutex_init() is done in exfat_fill_super(); only for 3.7+ */
+	mutex_destroy(&sbi->s_lock);
+#endif
+	kfree(sbi);
+}
+
+static void exfat_put_super(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	if (__is_sb_dirty(sb))
+		exfat_write_super(sb);
+
+	FsUmountVol(sb);
+
+	sb->s_fs_info = NULL;
+	exfat_free_super(sbi);
+}
+
+static void exfat_write_super(struct super_block *sb)
+{
+	__lock_super(sb);
+
+	__set_sb_clean(sb);
+
+	if (!(sb->s_flags & MS_RDONLY))
+		FsSyncVol(sb, 1);
+
+	__unlock_super(sb);
+}
+
+static int exfat_sync_fs(struct super_block *sb, int wait)
+{
+	int err = 0;
+
+	if (__is_sb_dirty(sb)) {
+		__lock_super(sb);
+		__set_sb_clean(sb);
+		err = FsSyncVol(sb, 1);
+		__unlock_super(sb);
+	}
+
+	return err;
+}
+
+static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	VOL_INFO_T info;
+
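+	/* a cached value of ~0 means the used-cluster count is unknown; query the volume */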
+	if (p_fs->used_clusters == (u32) ~0) {
+		if (FFS_MEDIAERR == FsGetVolInfo(sb, &info))
+			return -EIO;
+
+	} else {
+		info.FatType = p_fs->vol_type;
+		info.ClusterSize = p_fs->cluster_size;
+		info.NumClusters = p_fs->num_clusters - 2;
+		info.UsedClusters = p_fs->used_clusters;
+		info.FreeClusters = info.NumClusters - info.UsedClusters;
+
+		if (p_fs->dev_ejected)
+			printk(KERN_INFO "[EXFAT] statfs on an ejected device\n");
+	}
+
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = info.ClusterSize;
+	buf->f_blocks = info.NumClusters;
+	buf->f_bfree = info.FreeClusters;
+	buf->f_bavail = info.FreeClusters;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 260;
+
+	return 0;
+}
+
+static int exfat_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags |= MS_NODIRATIME;
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(root->d_sb);
+#else
+static int exfat_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(mnt->mnt_sb);
+#endif
+	struct exfat_mount_options *opts = &sbi->options;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	if (__kuid_val(opts->fs_uid))
+		seq_printf(m, ",uid=%u", __kuid_val(opts->fs_uid));
+	if (__kgid_val(opts->fs_gid))
+		seq_printf(m, ",gid=%u", __kgid_val(opts->fs_gid));
+#else
+	if (opts->fs_uid != 0)
+		seq_printf(m, ",uid=%u", opts->fs_uid);
+	if (opts->fs_gid != 0)
+		seq_printf(m, ",gid=%u", opts->fs_gid);
+#endif
+	seq_printf(m, ",fmask=%04o", opts->fs_fmask);
+	seq_printf(m, ",dmask=%04o", opts->fs_dmask);
+	if (opts->allow_utime)
+		seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
+	if (sbi->nls_disk)
+		seq_printf(m, ",codepage=%s", sbi->nls_disk->charset);
+	if (sbi->nls_io)
+		seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
+	seq_printf(m, ",namecase=%u", opts->casesensitive);
+	if (opts->errors == EXFAT_ERRORS_CONT)
+		seq_puts(m, ",errors=continue");
+	else if (opts->errors == EXFAT_ERRORS_PANIC)
+		seq_puts(m, ",errors=panic");
+	else
+		seq_puts(m, ",errors=remount-ro");
+#ifdef CONFIG_EXFAT_DISCARD
+	if (opts->discard)
+		seq_puts(m, ",discard");
+#endif
+	return 0;
+}
+
+const struct super_operations exfat_sops = {
+	.alloc_inode   = exfat_alloc_inode,
+	.destroy_inode = exfat_destroy_inode,
+	.write_inode   = exfat_write_inode,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.delete_inode  = exfat_delete_inode,
+	.clear_inode   = exfat_clear_inode,
+#else
+	.evict_inode  = exfat_evict_inode,
+#endif
+	.put_super     = exfat_put_super,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	.write_super   = exfat_write_super,
+#endif
+	.sync_fs       = exfat_sync_fs,
+	.statfs        = exfat_statfs,
+	.remount_fs    = exfat_remount,
+	.show_options  = exfat_show_options,
+};
+
+/*======================================================================*/
+/*  Super Block Read Operations                                         */
+/*======================================================================*/
+
+enum {
+	Opt_uid,
+	Opt_gid,
+	Opt_umask,
+	Opt_dmask,
+	Opt_fmask,
+	Opt_allow_utime,
+	Opt_codepage,
+	Opt_charset,
+	Opt_namecase,
+	Opt_debug,
+	Opt_err_cont,
+	Opt_err_panic,
+	Opt_err_ro,
+	Opt_utf8_hack,
+	Opt_err,
+#ifdef CONFIG_EXFAT_DISCARD
+	Opt_discard,
+#endif /* CONFIG_EXFAT_DISCARD */
+};
+
+static const match_table_t exfat_tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_dmask, "dmask=%o"},
+	{Opt_fmask, "fmask=%o"},
+	{Opt_allow_utime, "allow_utime=%o"},
+	{Opt_codepage, "codepage=%u"},
+	{Opt_charset, "iocharset=%s"},
+	{Opt_namecase, "namecase=%u"},
+	{Opt_debug, "debug"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_utf8_hack, "utf8"},
+#ifdef CONFIG_EXFAT_DISCARD
+	{Opt_discard, "discard"},
+#endif /* CONFIG_EXFAT_DISCARD */
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, int silent, int *debug,
+						 struct exfat_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *iocharset;
+
+	opts->fs_uid = current_uid();
+	opts->fs_gid = current_gid();
+	opts->fs_fmask = opts->fs_dmask = current->fs->umask;
+	opts->allow_utime = (unsigned short) -1;
+	opts->codepage = exfat_default_codepage;
+	opts->iocharset = exfat_default_iocharset;
+	opts->casesensitive = 0;
+	opts->errors = EXFAT_ERRORS_RO;
+#ifdef CONFIG_EXFAT_DISCARD
+	opts->discard = 0;
+#endif
+	*debug = 0;
+
+	if (!options)
+		goto out;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, exfat_tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+			opts->fs_uid = KUIDT_INIT(option);
+#else
+			opts->fs_uid = option;
+#endif
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+			opts->fs_gid = KGIDT_INIT(option);
+#else
+			opts->fs_gid = option;
+#endif
+			break;
+		case Opt_umask:
+		case Opt_dmask:
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			if (token != Opt_dmask)
+				opts->fs_fmask = option;
+			if (token != Opt_fmask)
+				opts->fs_dmask = option;
+			break;
+		case Opt_allow_utime:
+			if (match_octal(&args[0], &option))
+				return 0;
+			opts->allow_utime = option & (S_IWGRP | S_IWOTH);
+			break;
+		case Opt_codepage:
+			if (match_int(&args[0], &option))
+				return 0;
+			opts->codepage = option;
+			break;
+		case Opt_charset:
+			if (opts->iocharset != exfat_default_iocharset)
+				kfree(opts->iocharset);
+			iocharset = match_strdup(&args[0]);
+			if (!iocharset)
+				return -ENOMEM;
+			opts->iocharset = iocharset;
+			break;
+		case Opt_namecase:
+			if (match_int(&args[0], &option))
+				return 0;
+			opts->casesensitive = option;
+			break;
+		case Opt_err_cont:
+			opts->errors = EXFAT_ERRORS_CONT;
+			break;
+		case Opt_err_panic:
+			opts->errors = EXFAT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			opts->errors = EXFAT_ERRORS_RO;
+			break;
+		case Opt_debug:
+			*debug = 1;
+			break;
+#ifdef CONFIG_EXFAT_DISCARD
+		case Opt_discard:
+			opts->discard = 1;
+			break;
+#endif /* CONFIG_EXFAT_DISCARD */
+		case Opt_utf8_hack:
+			break;
+		default:
+			if (!silent)
+				printk(KERN_ERR "[EXFAT] Unrecognized mount option %s or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+
+out:
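+	/* allow_utime left unset: default to the write bits that dmask does not mask out */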
+	if (opts->allow_utime == (unsigned short) -1)
+		opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
+
+	return 0;
+}
+
+static void exfat_hash_init(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	int i;
+
+	spin_lock_init(&sbi->inode_hash_lock);
+	for (i = 0; i < EXFAT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
+}
+
+static int exfat_read_root(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct timespec ts;
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	DIR_ENTRY_T info;
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.dir.dir = p_fs->root_dir;
+	EXFAT_I(inode)->fid.dir.flags = 0x01;
+	EXFAT_I(inode)->fid.entry = -1;
+	EXFAT_I(inode)->fid.start_clu = p_fs->root_dir;
+	EXFAT_I(inode)->fid.flags = 0x01;
+	EXFAT_I(inode)->fid.type = TYPE_DIR;
+	EXFAT_I(inode)->fid.rwoffset = 0;
+	EXFAT_I(inode)->fid.hint_last_off = -1;
+
+	EXFAT_I(inode)->target = NULL;
+
+	FsReadStat(inode, &info);
+
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = 0;
+	inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, S_IRWXUGO);
+	inode->i_op = &exfat_dir_inode_operations;
+	inode->i_fop = &exfat_dir_operations;
+
+	i_size_write(inode, info.Size);
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
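+	/* i_pos for the root: root_dir cluster in the high 32 bits, 0xffffffff in the low */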
+	EXFAT_I(inode)->i_pos = ((loff_t) p_fs->root_dir << 32) | 0xffffffff;
+	EXFAT_I(inode)->mmu_private = i_size_read(inode);
+
+	exfat_save_attr(inode, ATTR_SUBDIR);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,00)
+	set_nlink(inode, info.NumSubdirs + 2);
+#else
+	inode->i_nlink = info.NumSubdirs + 2;
+#endif
+
+	return 0;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
+static void setup_dops(struct super_block *sb)
+{
+	if (EXFAT_SB(sb)->options.casesensitive == 0)
+		sb->s_d_op = &exfat_ci_dentry_ops;
+	else
+		sb->s_d_op = &exfat_dentry_ops;
+}
+#endif
+
+static int exfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root_inode = NULL;
+	struct exfat_sb_info *sbi;
+	int debug, ret;
+	long error;
+	char buf[50];
+
+	/*
+	 * GFP_KERNEL is ok here, because while we do hold the
+	 * superblock lock, memory pressure can't call back into
+	 * the filesystem, since we're only just about to mount
+	 * it and have no inodes etc active!
+	 */
+	sbi = kzalloc(sizeof(struct exfat_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
+	mutex_init(&sbi->s_lock);
+#endif
+	sb->s_fs_info = sbi;
+	sb->s_flags |= MS_NODIRATIME;
+	sb->s_magic = EXFAT_SUPER_MAGIC;
+	sb->s_op = &exfat_sops;
+
+	error = parse_options(data, silent, &debug, &sbi->options);
+	if (error)
+		goto out_fail;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
+	setup_dops(sb);
+#endif
+
+	error = -EIO;
+	sb_min_blocksize(sb, 512);
+	sb->s_maxbytes = 0x7fffffffffffffffLL;    /* maximum file size */
+
+	ret = FsMountVol(sb);
+	if (ret) {
+		if (!silent)
+			printk(KERN_ERR "[EXFAT] FsMountVol failed\n");
+
+		goto out_fail;
+	}
+
+	/* set up enough so that it can read an inode */
+	exfat_hash_init(sb);
+
+	/*
+	 * The low byte of the FAT's first entry must have the same value
+	 * as the media field.  But in the real world, too many devices
+	 * write the wrong value, so that validity check was removed:
+	 *
+	 * if (FAT_FIRST_ENT(sb, media) != first)
+	 */
+
+	/* codepage is not meaningful in exfat */
+	if (sbi->fs_info.vol_type != EXFAT) {
+		error = -EINVAL;
+		sprintf(buf, "cp%d", sbi->options.codepage);
+		sbi->nls_disk = load_nls(buf);
+		if (!sbi->nls_disk) {
+			printk(KERN_ERR "[EXFAT] Codepage %s not found\n", buf);
+			goto out_fail2;
+		}
+	}
+
+	sbi->nls_io = load_nls(sbi->options.iocharset);
+
+	error = -ENOMEM;
+	root_inode = new_inode(sb);
+	if (!root_inode)
+		goto out_fail2;
+	root_inode->i_ino = EXFAT_ROOT_INO;
+	root_inode->i_version = 1;
+	error = exfat_read_root(root_inode);
+	if (error < 0)
+		goto out_fail2;
+	error = -ENOMEM;
+	exfat_attach(root_inode, EXFAT_I(root_inode)->i_pos);
+	insert_inode_hash(root_inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	sb->s_root = d_make_root(root_inode);
+#else
+	sb->s_root = d_alloc_root(root_inode);
+#endif
+	if (!sb->s_root) {
+		printk(KERN_ERR "[EXFAT] Getting the root inode failed\n");
+		goto out_fail2;
+	}
+
+	return 0;
+
+out_fail2:
+	FsUmountVol(sb);
+out_fail:
+	if (root_inode)
+		iput(root_inode);
+	sb->s_fs_info = NULL;
+	exfat_free_super(sbi);
+	return error;
+}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static int exfat_get_sb(struct file_system_type *fs_type,
+						int flags, const char *dev_name,
+						void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, exfat_fill_super, mnt);
+}
+#else
+static struct dentry *exfat_fs_mount(struct file_system_type *fs_type,
+									 int flags, const char *dev_name,
+									 void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, exfat_fill_super);
+}
+#endif
+
+static void init_once(void *foo)
+{
+	struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+
+	INIT_HLIST_NODE(&ei->i_hash_fat);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init exfat_init_inodecache(void)
+{
+	exfat_inode_cachep = kmem_cache_create("exfat_inode_cache",
+										   sizeof(struct exfat_inode_info),
+										   0, (SLAB_RECLAIM_ACCOUNT|
+												   SLAB_MEM_SPREAD),
+										   init_once);
+	if (exfat_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void __exit exfat_destroy_inodecache(void)
+{
+	kmem_cache_destroy(exfat_inode_cachep);
+}
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+static void exfat_debug_kill_sb(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct block_device *bdev = sb->s_bdev;
+
+	long flags;
+
+	if (sbi) {
+		flags = sbi->debug_flags;
+
+		if (flags & EXFAT_DEBUGFLAGS_INVALID_UMOUNT) {
+			/* invalidate_bdev() drops all device caches, including
+			 * dirty ones; we use this to simulate device removal. */
+			FsReleaseCache(sb);
+			invalidate_bdev(bdev);
+		}
+	}
+
+	kill_block_super(sb);
+}
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+static struct file_system_type exfat_fs_type = {
+	.owner       = THIS_MODULE,
+#if defined(CONFIG_MACH_LGE) || defined(CONFIG_HTC_BATT_CORE)
+	.name        = "texfat",
+#else
+	.name        = "exfat",
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+	.get_sb      = exfat_get_sb,
+#else
+	.mount       = exfat_fs_mount,
+#endif
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	.kill_sb    = exfat_debug_kill_sb,
+#else
+	.kill_sb    = kill_block_super,
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+	.fs_flags    = FS_REQUIRES_DEV,
+};
+
+static int __init init_exfat(void)
+{
+	int err;
+
+	err = FsInit();
+	if (err) {
+		if (err == FFS_MEMORYERR)
+			return -ENOMEM;
+		else
+			return -EIO;
+	}
+
+	printk(KERN_INFO "exFAT: Version %s\n", EXFAT_VERSION);
+
+	err = exfat_init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exfat_fs_type);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	FsShutdown();
+	return err;
+}
+
+static void __exit exit_exfat(void)
+{
+	exfat_destroy_inodecache();
+	unregister_filesystem(&exfat_fs_type);
+	FsShutdown();
+}
+
+module_init(init_exfat);
+module_exit(exit_exfat);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("exFAT Filesystem Driver");
+#ifdef MODULE_ALIAS_FS
+#if defined(CONFIG_MACH_LGE) || defined(CONFIG_HTC_BATT_CORE)
+MODULE_ALIAS_FS("texfat");
+#else
+MODULE_ALIAS_FS("exfat");
+#endif
+#endif
diff --git b/fs/exfat/exfat_super.h b/fs/exfat/exfat_super.h
new file mode 100644
index 0000000..916811e
--- /dev/null
+++ b/fs/exfat/exfat_super.h
@@ -0,0 +1,171 @@
+/* Some of the source code in this file came from "linux/fs/fat/fat.h".  */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef _EXFAT_LINUX_H
+#define _EXFAT_LINUX_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/swap.h>
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_core.h"
+
+#define EXFAT_ERRORS_CONT  1    /* ignore error and continue */
+#define EXFAT_ERRORS_PANIC 2    /* panic on error */
+#define EXFAT_ERRORS_RO    3    /* remount r/o on error */
+
+/* ioctl command */
+#define EXFAT_IOCTL_GET_VOLUME_ID _IOR('r', 0x12, __u32)
+
+struct exfat_mount_options {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	kuid_t fs_uid;
+	kgid_t fs_gid;
+#else
+	uid_t fs_uid;
+	gid_t fs_gid;
+#endif
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short allow_utime; /* permission for setting the [am]time */
+	unsigned short codepage;    /* codepage for shortname conversions */
+	char *iocharset;            /* charset for filename input/display */
+	unsigned char casesensitive;
+	unsigned char errors;       /* on error: continue, panic, remount-ro */
+#ifdef CONFIG_EXFAT_DISCARD
+	unsigned char discard;      /* set if '-o discard' is given and the device supports discard */
+#endif /* CONFIG_EXFAT_DISCARD */
+};
+
+#define EXFAT_HASH_BITS    8
+#define EXFAT_HASH_SIZE    (1UL << EXFAT_HASH_BITS)
+
+/*
+ * EXFAT file system in-core superblock data
+ */
+struct exfat_sb_info {
+	FS_INFO_T fs_info;
+	BD_INFO_T bd_info;
+
+	struct exfat_mount_options options;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+	int s_dirt;
+	struct mutex s_lock;
+#endif
+	struct nls_table *nls_disk; /* Codepage used on disk */
+	struct nls_table *nls_io;   /* Charset used for input and display */
+
+	struct inode *fat_inode;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	long debug_flags;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+};
+
+/*
+ * EXFAT file system inode data in memory
+ */
+struct exfat_inode_info {
+	FILE_ID_T fid;
+	char  *target;
+	/* NOTE: mmu_private is 64 bits, so ->i_mutex must be held to access it */
+	loff_t mmu_private;         /* physically allocated size */
+	loff_t i_pos;               /* on-disk position of directory entry or 0 */
+	struct hlist_node i_hash_fat;	/* hash by i_location */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	struct rw_semaphore truncate_lock;
+#endif
+	struct inode vfs_inode;
+	struct rw_semaphore i_alloc_sem; /* protect bmap against truncate */
+};
+
+#define EXFAT_SB(sb)		((struct exfat_sb_info *)((sb)->s_fs_info))
+
+static inline struct exfat_inode_info *EXFAT_I(struct inode *inode)
+{
+	return container_of(inode, struct exfat_inode_info, vfs_inode);
+}
+
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's a directory and !sbi->options.rodir, ATTR_RO isn't the
+ * read-only bit; it's just used as a flag for the application.
+ */
+static inline int exfat_mode_can_hold_ro(struct inode *inode)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+	if (S_ISDIR(inode->i_mode))
+		return 0;
+
+	if ((~sbi->options.fs_fmask) & S_IWUGO)
+		return 1;
+	return 0;
+}
+
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi,
+									 u32 attr, mode_t mode)
+{
+	if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR))
+		mode &= ~S_IWUGO;
+
+	if (attr & ATTR_SUBDIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else if (attr & ATTR_SYMLINK)
+		return (mode & ~sbi->options.fs_dmask) | S_IFLNK;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u32 exfat_make_attr(struct inode *inode)
+{
+	if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		return (EXFAT_I(inode)->fid.attr) | ATTR_READONLY;
+	else
+		return EXFAT_I(inode)->fid.attr;
+}
+
+static inline void exfat_save_attr(struct inode *inode, u32 attr)
+{
+	if (exfat_mode_can_hold_ro(inode))
+		EXFAT_I(inode)->fid.attr = attr & ATTR_RWMASK;
+	else
+		EXFAT_I(inode)->fid.attr = attr & (ATTR_RWMASK | ATTR_READONLY);
+}
+
+#endif /* _EXFAT_LINUX_H */
diff --git b/fs/exfat/exfat_upcase.c b/fs/exfat/exfat_upcase.c
new file mode 100644
index 0000000..3807f37
--- /dev/null
+++ b/fs/exfat/exfat_upcase.c
@@ -0,0 +1,405 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_upcase.c                                            */
+/*  PURPOSE : exFAT Up-case Table                                       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+
+#include "exfat_nls.h"
+
+const u8 uni_upcase[NUM_UPCASE<<1] = {
+	0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00,
+	0x08, 0x00, 0x09, 0x00, 0x0A, 0x00, 0x0B, 0x00, 0x0C, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0F, 0x00,
+	0x10, 0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00,
+	0x18, 0x00, 0x19, 0x00, 0x1A, 0x00, 0x1B, 0x00, 0x1C, 0x00, 0x1D, 0x00, 0x1E, 0x00, 0x1F, 0x00,
+	0x20, 0x00, 0x21, 0x00, 0x22, 0x00, 0x23, 0x00, 0x24, 0x00, 0x25, 0x00, 0x26, 0x00, 0x27, 0x00,
+	0x28, 0x00, 0x29, 0x00, 0x2A, 0x00, 0x2B, 0x00, 0x2C, 0x00, 0x2D, 0x00, 0x2E, 0x00, 0x2F, 0x00,
+	0x30, 0x00, 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x34, 0x00, 0x35, 0x00, 0x36, 0x00, 0x37, 0x00,
+	0x38, 0x00, 0x39, 0x00, 0x3A, 0x00, 0x3B, 0x00, 0x3C, 0x00, 0x3D, 0x00, 0x3E, 0x00, 0x3F, 0x00,
+	0x40, 0x00, 0x41, 0x00, 0x42, 0x00, 0x43, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47, 0x00,
+	0x48, 0x00, 0x49, 0x00, 0x4A, 0x00, 0x4B, 0x00, 0x4C, 0x00, 0x4D, 0x00, 0x4E, 0x00, 0x4F, 0x00,
+	0x50, 0x00, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x54, 0x00, 0x55, 0x00, 0x56, 0x00, 0x57, 0x00,
+	0x58, 0x00, 0x59, 0x00, 0x5A, 0x00, 0x5B, 0x00, 0x5C, 0x00, 0x5D, 0x00, 0x5E, 0x00, 0x5F, 0x00,
+	0x60, 0x00, 0x41, 0x00, 0x42, 0x00, 0x43, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47, 0x00,
+	0x48, 0x00, 0x49, 0x00, 0x4A, 0x00, 0x4B, 0x00, 0x4C, 0x00, 0x4D, 0x00, 0x4E, 0x00, 0x4F, 0x00,
+	0x50, 0x00, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x54, 0x00, 0x55, 0x00, 0x56, 0x00, 0x57, 0x00,
+	0x58, 0x00, 0x59, 0x00, 0x5A, 0x00, 0x7B, 0x00, 0x7C, 0x00, 0x7D, 0x00, 0x7E, 0x00, 0x7F, 0x00,
+	0x80, 0x00, 0x81, 0x00, 0x82, 0x00, 0x83, 0x00, 0x84, 0x00, 0x85, 0x00, 0x86, 0x00, 0x87, 0x00,
+	0x88, 0x00, 0x89, 0x00, 0x8A, 0x00, 0x8B, 0x00, 0x8C, 0x00, 0x8D, 0x00, 0x8E, 0x00, 0x8F, 0x00,
+	0x90, 0x00, 0x91, 0x00, 0x92, 0x00, 0x93, 0x00, 0x94, 0x00, 0x95, 0x00, 0x96, 0x00, 0x97, 0x00,
+	0x98, 0x00, 0x99, 0x00, 0x9A, 0x00, 0x9B, 0x00, 0x9C, 0x00, 0x9D, 0x00, 0x9E, 0x00, 0x9F, 0x00,
+	0xA0, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, 0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, 0x00,
+	0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, 0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, 0x00,
+	0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, 0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, 0x00,
+	0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, 0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, 0x00,
+	0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, 0x00,
+	0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, 0x00,
+	0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, 0x00,
+	0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, 0x00,
+	0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, 0x00,
+	0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, 0x00,
+	0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xF7, 0x00,
+	0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0x78, 0x01,
+	0x00, 0x01, 0x00, 0x01, 0x02, 0x01, 0x02, 0x01, 0x04, 0x01, 0x04, 0x01, 0x06, 0x01, 0x06, 0x01,
+	0x08, 0x01, 0x08, 0x01, 0x0A, 0x01, 0x0A, 0x01, 0x0C, 0x01, 0x0C, 0x01, 0x0E, 0x01, 0x0E, 0x01,
+	0x10, 0x01, 0x10, 0x01, 0x12, 0x01, 0x12, 0x01, 0x14, 0x01, 0x14, 0x01, 0x16, 0x01, 0x16, 0x01,
+	0x18, 0x01, 0x18, 0x01, 0x1A, 0x01, 0x1A, 0x01, 0x1C, 0x01, 0x1C, 0x01, 0x1E, 0x01, 0x1E, 0x01,
+	0x20, 0x01, 0x20, 0x01, 0x22, 0x01, 0x22, 0x01, 0x24, 0x01, 0x24, 0x01, 0x26, 0x01, 0x26, 0x01,
+	0x28, 0x01, 0x28, 0x01, 0x2A, 0x01, 0x2A, 0x01, 0x2C, 0x01, 0x2C, 0x01, 0x2E, 0x01, 0x2E, 0x01,
+	0x30, 0x01, 0x31, 0x01, 0x32, 0x01, 0x32, 0x01, 0x34, 0x01, 0x34, 0x01, 0x36, 0x01, 0x36, 0x01,
+	0x38, 0x01, 0x39, 0x01, 0x39, 0x01, 0x3B, 0x01, 0x3B, 0x01, 0x3D, 0x01, 0x3D, 0x01, 0x3F, 0x01,
+	0x3F, 0x01, 0x41, 0x01, 0x41, 0x01, 0x43, 0x01, 0x43, 0x01, 0x45, 0x01, 0x45, 0x01, 0x47, 0x01,
+	0x47, 0x01, 0x49, 0x01, 0x4A, 0x01, 0x4A, 0x01, 0x4C, 0x01, 0x4C, 0x01, 0x4E, 0x01, 0x4E, 0x01,
+	0x50, 0x01, 0x50, 0x01, 0x52, 0x01, 0x52, 0x01, 0x54, 0x01, 0x54, 0x01, 0x56, 0x01, 0x56, 0x01,
+	0x58, 0x01, 0x58, 0x01, 0x5A, 0x01, 0x5A, 0x01, 0x5C, 0x01, 0x5C, 0x01, 0x5E, 0x01, 0x5E, 0x01,
+	0x60, 0x01, 0x60, 0x01, 0x62, 0x01, 0x62, 0x01, 0x64, 0x01, 0x64, 0x01, 0x66, 0x01, 0x66, 0x01,
+	0x68, 0x01, 0x68, 0x01, 0x6A, 0x01, 0x6A, 0x01, 0x6C, 0x01, 0x6C, 0x01, 0x6E, 0x01, 0x6E, 0x01,
+	0x70, 0x01, 0x70, 0x01, 0x72, 0x01, 0x72, 0x01, 0x74, 0x01, 0x74, 0x01, 0x76, 0x01, 0x76, 0x01,
+	0x78, 0x01, 0x79, 0x01, 0x79, 0x01, 0x7B, 0x01, 0x7B, 0x01, 0x7D, 0x01, 0x7D, 0x01, 0x7F, 0x01,
+	0x43, 0x02, 0x81, 0x01, 0x82, 0x01, 0x82, 0x01, 0x84, 0x01, 0x84, 0x01, 0x86, 0x01, 0x87, 0x01,
+	0x87, 0x01, 0x89, 0x01, 0x8A, 0x01, 0x8B, 0x01, 0x8B, 0x01, 0x8D, 0x01, 0x8E, 0x01, 0x8F, 0x01,
+	0x90, 0x01, 0x91, 0x01, 0x91, 0x01, 0x93, 0x01, 0x94, 0x01, 0xF6, 0x01, 0x96, 0x01, 0x97, 0x01,
+	0x98, 0x01, 0x98, 0x01, 0x3D, 0x02, 0x9B, 0x01, 0x9C, 0x01, 0x9D, 0x01, 0x20, 0x02, 0x9F, 0x01,
+	0xA0, 0x01, 0xA0, 0x01, 0xA2, 0x01, 0xA2, 0x01, 0xA4, 0x01, 0xA4, 0x01, 0xA6, 0x01, 0xA7, 0x01,
+	0xA7, 0x01, 0xA9, 0x01, 0xAA, 0x01, 0xAB, 0x01, 0xAC, 0x01, 0xAC, 0x01, 0xAE, 0x01, 0xAF, 0x01,
+	0xAF, 0x01, 0xB1, 0x01, 0xB2, 0x01, 0xB3, 0x01, 0xB3, 0x01, 0xB5, 0x01, 0xB5, 0x01, 0xB7, 0x01,
+	0xB8, 0x01, 0xB8, 0x01, 0xBA, 0x01, 0xBB, 0x01, 0xBC, 0x01, 0xBC, 0x01, 0xBE, 0x01, 0xF7, 0x01,
+	0xC0, 0x01, 0xC1, 0x01, 0xC2, 0x01, 0xC3, 0x01, 0xC4, 0x01, 0xC5, 0x01, 0xC4, 0x01, 0xC7, 0x01,
+	0xC8, 0x01, 0xC7, 0x01, 0xCA, 0x01, 0xCB, 0x01, 0xCA, 0x01, 0xCD, 0x01, 0xCD, 0x01, 0xCF, 0x01,
+	0xCF, 0x01, 0xD1, 0x01, 0xD1, 0x01, 0xD3, 0x01, 0xD3, 0x01, 0xD5, 0x01, 0xD5, 0x01, 0xD7, 0x01,
+	0xD7, 0x01, 0xD9, 0x01, 0xD9, 0x01, 0xDB, 0x01, 0xDB, 0x01, 0x8E, 0x01, 0xDE, 0x01, 0xDE, 0x01,
+	0xE0, 0x01, 0xE0, 0x01, 0xE2, 0x01, 0xE2, 0x01, 0xE4, 0x01, 0xE4, 0x01, 0xE6, 0x01, 0xE6, 0x01,
+	0xE8, 0x01, 0xE8, 0x01, 0xEA, 0x01, 0xEA, 0x01, 0xEC, 0x01, 0xEC, 0x01, 0xEE, 0x01, 0xEE, 0x01,
+	0xF0, 0x01, 0xF1, 0x01, 0xF2, 0x01, 0xF1, 0x01, 0xF4, 0x01, 0xF4, 0x01, 0xF6, 0x01, 0xF7, 0x01,
+	0xF8, 0x01, 0xF8, 0x01, 0xFA, 0x01, 0xFA, 0x01, 0xFC, 0x01, 0xFC, 0x01, 0xFE, 0x01, 0xFE, 0x01,
+	0x00, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x04, 0x02, 0x04, 0x02, 0x06, 0x02, 0x06, 0x02,
+	0x08, 0x02, 0x08, 0x02, 0x0A, 0x02, 0x0A, 0x02, 0x0C, 0x02, 0x0C, 0x02, 0x0E, 0x02, 0x0E, 0x02,
+	0x10, 0x02, 0x10, 0x02, 0x12, 0x02, 0x12, 0x02, 0x14, 0x02, 0x14, 0x02, 0x16, 0x02, 0x16, 0x02,
+	0x18, 0x02, 0x18, 0x02, 0x1A, 0x02, 0x1A, 0x02, 0x1C, 0x02, 0x1C, 0x02, 0x1E, 0x02, 0x1E, 0x02,
+	0x20, 0x02, 0x21, 0x02, 0x22, 0x02, 0x22, 0x02, 0x24, 0x02, 0x24, 0x02, 0x26, 0x02, 0x26, 0x02,
+	0x28, 0x02, 0x28, 0x02, 0x2A, 0x02, 0x2A, 0x02, 0x2C, 0x02, 0x2C, 0x02, 0x2E, 0x02, 0x2E, 0x02,
+	0x30, 0x02, 0x30, 0x02, 0x32, 0x02, 0x32, 0x02, 0x34, 0x02, 0x35, 0x02, 0x36, 0x02, 0x37, 0x02,
+	0x38, 0x02, 0x39, 0x02, 0x65, 0x2C, 0x3B, 0x02, 0x3B, 0x02, 0x3D, 0x02, 0x66, 0x2C, 0x3F, 0x02,
+	0x40, 0x02, 0x41, 0x02, 0x41, 0x02, 0x43, 0x02, 0x44, 0x02, 0x45, 0x02, 0x46, 0x02, 0x46, 0x02,
+	0x48, 0x02, 0x48, 0x02, 0x4A, 0x02, 0x4A, 0x02, 0x4C, 0x02, 0x4C, 0x02, 0x4E, 0x02, 0x4E, 0x02,
+	0x50, 0x02, 0x51, 0x02, 0x52, 0x02, 0x81, 0x01, 0x86, 0x01, 0x55, 0x02, 0x89, 0x01, 0x8A, 0x01,
+	0x58, 0x02, 0x8F, 0x01, 0x5A, 0x02, 0x90, 0x01, 0x5C, 0x02, 0x5D, 0x02, 0x5E, 0x02, 0x5F, 0x02,
+	0x93, 0x01, 0x61, 0x02, 0x62, 0x02, 0x94, 0x01, 0x64, 0x02, 0x65, 0x02, 0x66, 0x02, 0x67, 0x02,
+	0x97, 0x01, 0x96, 0x01, 0x6A, 0x02, 0x62, 0x2C, 0x6C, 0x02, 0x6D, 0x02, 0x6E, 0x02, 0x9C, 0x01,
+	0x70, 0x02, 0x71, 0x02, 0x9D, 0x01, 0x73, 0x02, 0x74, 0x02, 0x9F, 0x01, 0x76, 0x02, 0x77, 0x02,
+	0x78, 0x02, 0x79, 0x02, 0x7A, 0x02, 0x7B, 0x02, 0x7C, 0x02, 0x64, 0x2C, 0x7E, 0x02, 0x7F, 0x02,
+	0xA6, 0x01, 0x81, 0x02, 0x82, 0x02, 0xA9, 0x01, 0x84, 0x02, 0x85, 0x02, 0x86, 0x02, 0x87, 0x02,
+	0xAE, 0x01, 0x44, 0x02, 0xB1, 0x01, 0xB2, 0x01, 0x45, 0x02, 0x8D, 0x02, 0x8E, 0x02, 0x8F, 0x02,
+	0x90, 0x02, 0x91, 0x02, 0xB7, 0x01, 0x93, 0x02, 0x94, 0x02, 0x95, 0x02, 0x96, 0x02, 0x97, 0x02,
+	0x98, 0x02, 0x99, 0x02, 0x9A, 0x02, 0x9B, 0x02, 0x9C, 0x02, 0x9D, 0x02, 0x9E, 0x02, 0x9F, 0x02,
+	0xA0, 0x02, 0xA1, 0x02, 0xA2, 0x02, 0xA3, 0x02, 0xA4, 0x02, 0xA5, 0x02, 0xA6, 0x02, 0xA7, 0x02,
+	0xA8, 0x02, 0xA9, 0x02, 0xAA, 0x02, 0xAB, 0x02, 0xAC, 0x02, 0xAD, 0x02, 0xAE, 0x02, 0xAF, 0x02,
+	0xB0, 0x02, 0xB1, 0x02, 0xB2, 0x02, 0xB3, 0x02, 0xB4, 0x02, 0xB5, 0x02, 0xB6, 0x02, 0xB7, 0x02,
+	0xB8, 0x02, 0xB9, 0x02, 0xBA, 0x02, 0xBB, 0x02, 0xBC, 0x02, 0xBD, 0x02, 0xBE, 0x02, 0xBF, 0x02,
+	0xC0, 0x02, 0xC1, 0x02, 0xC2, 0x02, 0xC3, 0x02, 0xC4, 0x02, 0xC5, 0x02, 0xC6, 0x02, 0xC7, 0x02,
+	0xC8, 0x02, 0xC9, 0x02, 0xCA, 0x02, 0xCB, 0x02, 0xCC, 0x02, 0xCD, 0x02, 0xCE, 0x02, 0xCF, 0x02,
+	0xD0, 0x02, 0xD1, 0x02, 0xD2, 0x02, 0xD3, 0x02, 0xD4, 0x02, 0xD5, 0x02, 0xD6, 0x02, 0xD7, 0x02,
+	0xD8, 0x02, 0xD9, 0x02, 0xDA, 0x02, 0xDB, 0x02, 0xDC, 0x02, 0xDD, 0x02, 0xDE, 0x02, 0xDF, 0x02,
+	0xE0, 0x02, 0xE1, 0x02, 0xE2, 0x02, 0xE3, 0x02, 0xE4, 0x02, 0xE5, 0x02, 0xE6, 0x02, 0xE7, 0x02,
+	0xE8, 0x02, 0xE9, 0x02, 0xEA, 0x02, 0xEB, 0x02, 0xEC, 0x02, 0xED, 0x02, 0xEE, 0x02, 0xEF, 0x02,
+	0xF0, 0x02, 0xF1, 0x02, 0xF2, 0x02, 0xF3, 0x02, 0xF4, 0x02, 0xF5, 0x02, 0xF6, 0x02, 0xF7, 0x02,
+	0xF8, 0x02, 0xF9, 0x02, 0xFA, 0x02, 0xFB, 0x02, 0xFC, 0x02, 0xFD, 0x02, 0xFE, 0x02, 0xFF, 0x02,
+	0x00, 0x03, 0x01, 0x03, 0x02, 0x03, 0x03, 0x03, 0x04, 0x03, 0x05, 0x03, 0x06, 0x03, 0x07, 0x03,
+	0x08, 0x03, 0x09, 0x03, 0x0A, 0x03, 0x0B, 0x03, 0x0C, 0x03, 0x0D, 0x03, 0x0E, 0x03, 0x0F, 0x03,
+	0x10, 0x03, 0x11, 0x03, 0x12, 0x03, 0x13, 0x03, 0x14, 0x03, 0x15, 0x03, 0x16, 0x03, 0x17, 0x03,
+	0x18, 0x03, 0x19, 0x03, 0x1A, 0x03, 0x1B, 0x03, 0x1C, 0x03, 0x1D, 0x03, 0x1E, 0x03, 0x1F, 0x03,
+	0x20, 0x03, 0x21, 0x03, 0x22, 0x03, 0x23, 0x03, 0x24, 0x03, 0x25, 0x03, 0x26, 0x03, 0x27, 0x03,
+	0x28, 0x03, 0x29, 0x03, 0x2A, 0x03, 0x2B, 0x03, 0x2C, 0x03, 0x2D, 0x03, 0x2E, 0x03, 0x2F, 0x03,
+	0x30, 0x03, 0x31, 0x03, 0x32, 0x03, 0x33, 0x03, 0x34, 0x03, 0x35, 0x03, 0x36, 0x03, 0x37, 0x03,
+	0x38, 0x03, 0x39, 0x03, 0x3A, 0x03, 0x3B, 0x03, 0x3C, 0x03, 0x3D, 0x03, 0x3E, 0x03, 0x3F, 0x03,
+	0x40, 0x03, 0x41, 0x03, 0x42, 0x03, 0x43, 0x03, 0x44, 0x03, 0x45, 0x03, 0x46, 0x03, 0x47, 0x03,
+	0x48, 0x03, 0x49, 0x03, 0x4A, 0x03, 0x4B, 0x03, 0x4C, 0x03, 0x4D, 0x03, 0x4E, 0x03, 0x4F, 0x03,
+	0x50, 0x03, 0x51, 0x03, 0x52, 0x03, 0x53, 0x03, 0x54, 0x03, 0x55, 0x03, 0x56, 0x03, 0x57, 0x03,
+	0x58, 0x03, 0x59, 0x03, 0x5A, 0x03, 0x5B, 0x03, 0x5C, 0x03, 0x5D, 0x03, 0x5E, 0x03, 0x5F, 0x03,
+	0x60, 0x03, 0x61, 0x03, 0x62, 0x03, 0x63, 0x03, 0x64, 0x03, 0x65, 0x03, 0x66, 0x03, 0x67, 0x03,
+	0x68, 0x03, 0x69, 0x03, 0x6A, 0x03, 0x6B, 0x03, 0x6C, 0x03, 0x6D, 0x03, 0x6E, 0x03, 0x6F, 0x03,
+	0x70, 0x03, 0x71, 0x03, 0x72, 0x03, 0x73, 0x03, 0x74, 0x03, 0x75, 0x03, 0x76, 0x03, 0x77, 0x03,
+	0x78, 0x03, 0x79, 0x03, 0x7A, 0x03, 0xFD, 0x03, 0xFE, 0x03, 0xFF, 0x03, 0x7E, 0x03, 0x7F, 0x03,
+	0x80, 0x03, 0x81, 0x03, 0x82, 0x03, 0x83, 0x03, 0x84, 0x03, 0x85, 0x03, 0x86, 0x03, 0x87, 0x03,
+	0x88, 0x03, 0x89, 0x03, 0x8A, 0x03, 0x8B, 0x03, 0x8C, 0x03, 0x8D, 0x03, 0x8E, 0x03, 0x8F, 0x03,
+	0x90, 0x03, 0x91, 0x03, 0x92, 0x03, 0x93, 0x03, 0x94, 0x03, 0x95, 0x03, 0x96, 0x03, 0x97, 0x03,
+	0x98, 0x03, 0x99, 0x03, 0x9A, 0x03, 0x9B, 0x03, 0x9C, 0x03, 0x9D, 0x03, 0x9E, 0x03, 0x9F, 0x03,
+	0xA0, 0x03, 0xA1, 0x03, 0xA2, 0x03, 0xA3, 0x03, 0xA4, 0x03, 0xA5, 0x03, 0xA6, 0x03, 0xA7, 0x03,
+	0xA8, 0x03, 0xA9, 0x03, 0xAA, 0x03, 0xAB, 0x03, 0x86, 0x03, 0x88, 0x03, 0x89, 0x03, 0x8A, 0x03,
+	0xB0, 0x03, 0x91, 0x03, 0x92, 0x03, 0x93, 0x03, 0x94, 0x03, 0x95, 0x03, 0x96, 0x03, 0x97, 0x03,
+	0x98, 0x03, 0x99, 0x03, 0x9A, 0x03, 0x9B, 0x03, 0x9C, 0x03, 0x9D, 0x03, 0x9E, 0x03, 0x9F, 0x03,
+	0xA0, 0x03, 0xA1, 0x03, 0xA3, 0x03, 0xA3, 0x03, 0xA4, 0x03, 0xA5, 0x03, 0xA6, 0x03, 0xA7, 0x03,
+	0xA8, 0x03, 0xA9, 0x03, 0xAA, 0x03, 0xAB, 0x03, 0x8C, 0x03, 0x8E, 0x03, 0x8F, 0x03, 0xCF, 0x03,
+	0xD0, 0x03, 0xD1, 0x03, 0xD2, 0x03, 0xD3, 0x03, 0xD4, 0x03, 0xD5, 0x03, 0xD6, 0x03, 0xD7, 0x03,
+	0xD8, 0x03, 0xD8, 0x03, 0xDA, 0x03, 0xDA, 0x03, 0xDC, 0x03, 0xDC, 0x03, 0xDE, 0x03, 0xDE, 0x03,
+	0xE0, 0x03, 0xE0, 0x03, 0xE2, 0x03, 0xE2, 0x03, 0xE4, 0x03, 0xE4, 0x03, 0xE6, 0x03, 0xE6, 0x03,
+	0xE8, 0x03, 0xE8, 0x03, 0xEA, 0x03, 0xEA, 0x03, 0xEC, 0x03, 0xEC, 0x03, 0xEE, 0x03, 0xEE, 0x03,
+	0xF0, 0x03, 0xF1, 0x03, 0xF9, 0x03, 0xF3, 0x03, 0xF4, 0x03, 0xF5, 0x03, 0xF6, 0x03, 0xF7, 0x03,
+	0xF7, 0x03, 0xF9, 0x03, 0xFA, 0x03, 0xFA, 0x03, 0xFC, 0x03, 0xFD, 0x03, 0xFE, 0x03, 0xFF, 0x03,
+	0x00, 0x04, 0x01, 0x04, 0x02, 0x04, 0x03, 0x04, 0x04, 0x04, 0x05, 0x04, 0x06, 0x04, 0x07, 0x04,
+	0x08, 0x04, 0x09, 0x04, 0x0A, 0x04, 0x0B, 0x04, 0x0C, 0x04, 0x0D, 0x04, 0x0E, 0x04, 0x0F, 0x04,
+	0x10, 0x04, 0x11, 0x04, 0x12, 0x04, 0x13, 0x04, 0x14, 0x04, 0x15, 0x04, 0x16, 0x04, 0x17, 0x04,
+	0x18, 0x04, 0x19, 0x04, 0x1A, 0x04, 0x1B, 0x04, 0x1C, 0x04, 0x1D, 0x04, 0x1E, 0x04, 0x1F, 0x04,
+	0x20, 0x04, 0x21, 0x04, 0x22, 0x04, 0x23, 0x04, 0x24, 0x04, 0x25, 0x04, 0x26, 0x04, 0x27, 0x04,
+	0x28, 0x04, 0x29, 0x04, 0x2A, 0x04, 0x2B, 0x04, 0x2C, 0x04, 0x2D, 0x04, 0x2E, 0x04, 0x2F, 0x04,
+	0x10, 0x04, 0x11, 0x04, 0x12, 0x04, 0x13, 0x04, 0x14, 0x04, 0x15, 0x04, 0x16, 0x04, 0x17, 0x04,
+	0x18, 0x04, 0x19, 0x04, 0x1A, 0x04, 0x1B, 0x04, 0x1C, 0x04, 0x1D, 0x04, 0x1E, 0x04, 0x1F, 0x04,
+	0x20, 0x04, 0x21, 0x04, 0x22, 0x04, 0x23, 0x04, 0x24, 0x04, 0x25, 0x04, 0x26, 0x04, 0x27, 0x04,
+	0x28, 0x04, 0x29, 0x04, 0x2A, 0x04, 0x2B, 0x04, 0x2C, 0x04, 0x2D, 0x04, 0x2E, 0x04, 0x2F, 0x04,
+	0x00, 0x04, 0x01, 0x04, 0x02, 0x04, 0x03, 0x04, 0x04, 0x04, 0x05, 0x04, 0x06, 0x04, 0x07, 0x04,
+	0x08, 0x04, 0x09, 0x04, 0x0A, 0x04, 0x0B, 0x04, 0x0C, 0x04, 0x0D, 0x04, 0x0E, 0x04, 0x0F, 0x04,
+	0x60, 0x04, 0x60, 0x04, 0x62, 0x04, 0x62, 0x04, 0x64, 0x04, 0x64, 0x04, 0x66, 0x04, 0x66, 0x04,
+	0x68, 0x04, 0x68, 0x04, 0x6A, 0x04, 0x6A, 0x04, 0x6C, 0x04, 0x6C, 0x04, 0x6E, 0x04, 0x6E, 0x04,
+	0x70, 0x04, 0x70, 0x04, 0x72, 0x04, 0x72, 0x04, 0x74, 0x04, 0x74, 0x04, 0x76, 0x04, 0x76, 0x04,
+	0x78, 0x04, 0x78, 0x04, 0x7A, 0x04, 0x7A, 0x04, 0x7C, 0x04, 0x7C, 0x04, 0x7E, 0x04, 0x7E, 0x04,
+	0x80, 0x04, 0x80, 0x04, 0x82, 0x04, 0x83, 0x04, 0x84, 0x04, 0x85, 0x04, 0x86, 0x04, 0x87, 0x04,
+	0x88, 0x04, 0x89, 0x04, 0x8A, 0x04, 0x8A, 0x04, 0x8C, 0x04, 0x8C, 0x04, 0x8E, 0x04, 0x8E, 0x04,
+	0x90, 0x04, 0x90, 0x04, 0x92, 0x04, 0x92, 0x04, 0x94, 0x04, 0x94, 0x04, 0x96, 0x04, 0x96, 0x04,
+	0x98, 0x04, 0x98, 0x04, 0x9A, 0x04, 0x9A, 0x04, 0x9C, 0x04, 0x9C, 0x04, 0x9E, 0x04, 0x9E, 0x04,
+	0xA0, 0x04, 0xA0, 0x04, 0xA2, 0x04, 0xA2, 0x04, 0xA4, 0x04, 0xA4, 0x04, 0xA6, 0x04, 0xA6, 0x04,
+	0xA8, 0x04, 0xA8, 0x04, 0xAA, 0x04, 0xAA, 0x04, 0xAC, 0x04, 0xAC, 0x04, 0xAE, 0x04, 0xAE, 0x04,
+	0xB0, 0x04, 0xB0, 0x04, 0xB2, 0x04, 0xB2, 0x04, 0xB4, 0x04, 0xB4, 0x04, 0xB6, 0x04, 0xB6, 0x04,
+	0xB8, 0x04, 0xB8, 0x04, 0xBA, 0x04, 0xBA, 0x04, 0xBC, 0x04, 0xBC, 0x04, 0xBE, 0x04, 0xBE, 0x04,
+	0xC0, 0x04, 0xC1, 0x04, 0xC1, 0x04, 0xC3, 0x04, 0xC3, 0x04, 0xC5, 0x04, 0xC5, 0x04, 0xC7, 0x04,
+	0xC7, 0x04, 0xC9, 0x04, 0xC9, 0x04, 0xCB, 0x04, 0xCB, 0x04, 0xCD, 0x04, 0xCD, 0x04, 0xC0, 0x04,
+	0xD0, 0x04, 0xD0, 0x04, 0xD2, 0x04, 0xD2, 0x04, 0xD4, 0x04, 0xD4, 0x04, 0xD6, 0x04, 0xD6, 0x04,
+	0xD8, 0x04, 0xD8, 0x04, 0xDA, 0x04, 0xDA, 0x04, 0xDC, 0x04, 0xDC, 0x04, 0xDE, 0x04, 0xDE, 0x04,
+	0xE0, 0x04, 0xE0, 0x04, 0xE2, 0x04, 0xE2, 0x04, 0xE4, 0x04, 0xE4, 0x04, 0xE6, 0x04, 0xE6, 0x04,
+	0xE8, 0x04, 0xE8, 0x04, 0xEA, 0x04, 0xEA, 0x04, 0xEC, 0x04, 0xEC, 0x04, 0xEE, 0x04, 0xEE, 0x04,
+	0xF0, 0x04, 0xF0, 0x04, 0xF2, 0x04, 0xF2, 0x04, 0xF4, 0x04, 0xF4, 0x04, 0xF6, 0x04, 0xF6, 0x04,
+	0xF8, 0x04, 0xF8, 0x04, 0xFA, 0x04, 0xFA, 0x04, 0xFC, 0x04, 0xFC, 0x04, 0xFE, 0x04, 0xFE, 0x04,
+	0x00, 0x05, 0x00, 0x05, 0x02, 0x05, 0x02, 0x05, 0x04, 0x05, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05,
+	0x08, 0x05, 0x08, 0x05, 0x0A, 0x05, 0x0A, 0x05, 0x0C, 0x05, 0x0C, 0x05, 0x0E, 0x05, 0x0E, 0x05,
+	0x10, 0x05, 0x10, 0x05, 0x12, 0x05, 0x12, 0x05, 0x14, 0x05, 0x15, 0x05, 0x16, 0x05, 0x17, 0x05,
+	0x18, 0x05, 0x19, 0x05, 0x1A, 0x05, 0x1B, 0x05, 0x1C, 0x05, 0x1D, 0x05, 0x1E, 0x05, 0x1F, 0x05,
+	0x20, 0x05, 0x21, 0x05, 0x22, 0x05, 0x23, 0x05, 0x24, 0x05, 0x25, 0x05, 0x26, 0x05, 0x27, 0x05,
+	0x28, 0x05, 0x29, 0x05, 0x2A, 0x05, 0x2B, 0x05, 0x2C, 0x05, 0x2D, 0x05, 0x2E, 0x05, 0x2F, 0x05,
+	0x30, 0x05, 0x31, 0x05, 0x32, 0x05, 0x33, 0x05, 0x34, 0x05, 0x35, 0x05, 0x36, 0x05, 0x37, 0x05,
+	0x38, 0x05, 0x39, 0x05, 0x3A, 0x05, 0x3B, 0x05, 0x3C, 0x05, 0x3D, 0x05, 0x3E, 0x05, 0x3F, 0x05,
+	0x40, 0x05, 0x41, 0x05, 0x42, 0x05, 0x43, 0x05, 0x44, 0x05, 0x45, 0x05, 0x46, 0x05, 0x47, 0x05,
+	0x48, 0x05, 0x49, 0x05, 0x4A, 0x05, 0x4B, 0x05, 0x4C, 0x05, 0x4D, 0x05, 0x4E, 0x05, 0x4F, 0x05,
+	0x50, 0x05, 0x51, 0x05, 0x52, 0x05, 0x53, 0x05, 0x54, 0x05, 0x55, 0x05, 0x56, 0x05, 0x57, 0x05,
+	0x58, 0x05, 0x59, 0x05, 0x5A, 0x05, 0x5B, 0x05, 0x5C, 0x05, 0x5D, 0x05, 0x5E, 0x05, 0x5F, 0x05,
+	0x60, 0x05, 0x31, 0x05, 0x32, 0x05, 0x33, 0x05, 0x34, 0x05, 0x35, 0x05, 0x36, 0x05, 0x37, 0x05,
+	0x38, 0x05, 0x39, 0x05, 0x3A, 0x05, 0x3B, 0x05, 0x3C, 0x05, 0x3D, 0x05, 0x3E, 0x05, 0x3F, 0x05,
+	0x40, 0x05, 0x41, 0x05, 0x42, 0x05, 0x43, 0x05, 0x44, 0x05, 0x45, 0x05, 0x46, 0x05, 0x47, 0x05,
+	0x48, 0x05, 0x49, 0x05, 0x4A, 0x05, 0x4B, 0x05, 0x4C, 0x05, 0x4D, 0x05, 0x4E, 0x05, 0x4F, 0x05,
+	0x50, 0x05, 0x51, 0x05, 0x52, 0x05, 0x53, 0x05, 0x54, 0x05, 0x55, 0x05, 0x56, 0x05, 0xFF, 0xFF,
+	0xF6, 0x17, 0x63, 0x2C, 0x7E, 0x1D, 0x7F, 0x1D, 0x80, 0x1D, 0x81, 0x1D, 0x82, 0x1D, 0x83, 0x1D,
+	0x84, 0x1D, 0x85, 0x1D, 0x86, 0x1D, 0x87, 0x1D, 0x88, 0x1D, 0x89, 0x1D, 0x8A, 0x1D, 0x8B, 0x1D,
+	0x8C, 0x1D, 0x8D, 0x1D, 0x8E, 0x1D, 0x8F, 0x1D, 0x90, 0x1D, 0x91, 0x1D, 0x92, 0x1D, 0x93, 0x1D,
+	0x94, 0x1D, 0x95, 0x1D, 0x96, 0x1D, 0x97, 0x1D, 0x98, 0x1D, 0x99, 0x1D, 0x9A, 0x1D, 0x9B, 0x1D,
+	0x9C, 0x1D, 0x9D, 0x1D, 0x9E, 0x1D, 0x9F, 0x1D, 0xA0, 0x1D, 0xA1, 0x1D, 0xA2, 0x1D, 0xA3, 0x1D,
+	0xA4, 0x1D, 0xA5, 0x1D, 0xA6, 0x1D, 0xA7, 0x1D, 0xA8, 0x1D, 0xA9, 0x1D, 0xAA, 0x1D, 0xAB, 0x1D,
+	0xAC, 0x1D, 0xAD, 0x1D, 0xAE, 0x1D, 0xAF, 0x1D, 0xB0, 0x1D, 0xB1, 0x1D, 0xB2, 0x1D, 0xB3, 0x1D,
+	0xB4, 0x1D, 0xB5, 0x1D, 0xB6, 0x1D, 0xB7, 0x1D, 0xB8, 0x1D, 0xB9, 0x1D, 0xBA, 0x1D, 0xBB, 0x1D,
+	0xBC, 0x1D, 0xBD, 0x1D, 0xBE, 0x1D, 0xBF, 0x1D, 0xC0, 0x1D, 0xC1, 0x1D, 0xC2, 0x1D, 0xC3, 0x1D,
+	0xC4, 0x1D, 0xC5, 0x1D, 0xC6, 0x1D, 0xC7, 0x1D, 0xC8, 0x1D, 0xC9, 0x1D, 0xCA, 0x1D, 0xCB, 0x1D,
+	0xCC, 0x1D, 0xCD, 0x1D, 0xCE, 0x1D, 0xCF, 0x1D, 0xD0, 0x1D, 0xD1, 0x1D, 0xD2, 0x1D, 0xD3, 0x1D,
+	0xD4, 0x1D, 0xD5, 0x1D, 0xD6, 0x1D, 0xD7, 0x1D, 0xD8, 0x1D, 0xD9, 0x1D, 0xDA, 0x1D, 0xDB, 0x1D,
+	0xDC, 0x1D, 0xDD, 0x1D, 0xDE, 0x1D, 0xDF, 0x1D, 0xE0, 0x1D, 0xE1, 0x1D, 0xE2, 0x1D, 0xE3, 0x1D,
+	0xE4, 0x1D, 0xE5, 0x1D, 0xE6, 0x1D, 0xE7, 0x1D, 0xE8, 0x1D, 0xE9, 0x1D, 0xEA, 0x1D, 0xEB, 0x1D,
+	0xEC, 0x1D, 0xED, 0x1D, 0xEE, 0x1D, 0xEF, 0x1D, 0xF0, 0x1D, 0xF1, 0x1D, 0xF2, 0x1D, 0xF3, 0x1D,
+	0xF4, 0x1D, 0xF5, 0x1D, 0xF6, 0x1D, 0xF7, 0x1D, 0xF8, 0x1D, 0xF9, 0x1D, 0xFA, 0x1D, 0xFB, 0x1D,
+	0xFC, 0x1D, 0xFD, 0x1D, 0xFE, 0x1D, 0xFF, 0x1D, 0x00, 0x1E, 0x00, 0x1E, 0x02, 0x1E, 0x02, 0x1E,
+	0x04, 0x1E, 0x04, 0x1E, 0x06, 0x1E, 0x06, 0x1E, 0x08, 0x1E, 0x08, 0x1E, 0x0A, 0x1E, 0x0A, 0x1E,
+	0x0C, 0x1E, 0x0C, 0x1E, 0x0E, 0x1E, 0x0E, 0x1E, 0x10, 0x1E, 0x10, 0x1E, 0x12, 0x1E, 0x12, 0x1E,
+	0x14, 0x1E, 0x14, 0x1E, 0x16, 0x1E, 0x16, 0x1E, 0x18, 0x1E, 0x18, 0x1E, 0x1A, 0x1E, 0x1A, 0x1E,
+	0x1C, 0x1E, 0x1C, 0x1E, 0x1E, 0x1E, 0x1E, 0x1E, 0x20, 0x1E, 0x20, 0x1E, 0x22, 0x1E, 0x22, 0x1E,
+	0x24, 0x1E, 0x24, 0x1E, 0x26, 0x1E, 0x26, 0x1E, 0x28, 0x1E, 0x28, 0x1E, 0x2A, 0x1E, 0x2A, 0x1E,
+	0x2C, 0x1E, 0x2C, 0x1E, 0x2E, 0x1E, 0x2E, 0x1E, 0x30, 0x1E, 0x30, 0x1E, 0x32, 0x1E, 0x32, 0x1E,
+	0x34, 0x1E, 0x34, 0x1E, 0x36, 0x1E, 0x36, 0x1E, 0x38, 0x1E, 0x38, 0x1E, 0x3A, 0x1E, 0x3A, 0x1E,
+	0x3C, 0x1E, 0x3C, 0x1E, 0x3E, 0x1E, 0x3E, 0x1E, 0x40, 0x1E, 0x40, 0x1E, 0x42, 0x1E, 0x42, 0x1E,
+	0x44, 0x1E, 0x44, 0x1E, 0x46, 0x1E, 0x46, 0x1E, 0x48, 0x1E, 0x48, 0x1E, 0x4A, 0x1E, 0x4A, 0x1E,
+	0x4C, 0x1E, 0x4C, 0x1E, 0x4E, 0x1E, 0x4E, 0x1E, 0x50, 0x1E, 0x50, 0x1E, 0x52, 0x1E, 0x52, 0x1E,
+	0x54, 0x1E, 0x54, 0x1E, 0x56, 0x1E, 0x56, 0x1E, 0x58, 0x1E, 0x58, 0x1E, 0x5A, 0x1E, 0x5A, 0x1E,
+	0x5C, 0x1E, 0x5C, 0x1E, 0x5E, 0x1E, 0x5E, 0x1E, 0x60, 0x1E, 0x60, 0x1E, 0x62, 0x1E, 0x62, 0x1E,
+	0x64, 0x1E, 0x64, 0x1E, 0x66, 0x1E, 0x66, 0x1E, 0x68, 0x1E, 0x68, 0x1E, 0x6A, 0x1E, 0x6A, 0x1E,
+	0x6C, 0x1E, 0x6C, 0x1E, 0x6E, 0x1E, 0x6E, 0x1E, 0x70, 0x1E, 0x70, 0x1E, 0x72, 0x1E, 0x72, 0x1E,
+	0x74, 0x1E, 0x74, 0x1E, 0x76, 0x1E, 0x76, 0x1E, 0x78, 0x1E, 0x78, 0x1E, 0x7A, 0x1E, 0x7A, 0x1E,
+	0x7C, 0x1E, 0x7C, 0x1E, 0x7E, 0x1E, 0x7E, 0x1E, 0x80, 0x1E, 0x80, 0x1E, 0x82, 0x1E, 0x82, 0x1E,
+	0x84, 0x1E, 0x84, 0x1E, 0x86, 0x1E, 0x86, 0x1E, 0x88, 0x1E, 0x88, 0x1E, 0x8A, 0x1E, 0x8A, 0x1E,
+	0x8C, 0x1E, 0x8C, 0x1E, 0x8E, 0x1E, 0x8E, 0x1E, 0x90, 0x1E, 0x90, 0x1E, 0x92, 0x1E, 0x92, 0x1E,
+	0x94, 0x1E, 0x94, 0x1E, 0x96, 0x1E, 0x97, 0x1E, 0x98, 0x1E, 0x99, 0x1E, 0x9A, 0x1E, 0x9B, 0x1E,
+	0x9C, 0x1E, 0x9D, 0x1E, 0x9E, 0x1E, 0x9F, 0x1E, 0xA0, 0x1E, 0xA0, 0x1E, 0xA2, 0x1E, 0xA2, 0x1E,
+	0xA4, 0x1E, 0xA4, 0x1E, 0xA6, 0x1E, 0xA6, 0x1E, 0xA8, 0x1E, 0xA8, 0x1E, 0xAA, 0x1E, 0xAA, 0x1E,
+	0xAC, 0x1E, 0xAC, 0x1E, 0xAE, 0x1E, 0xAE, 0x1E, 0xB0, 0x1E, 0xB0, 0x1E, 0xB2, 0x1E, 0xB2, 0x1E,
+	0xB4, 0x1E, 0xB4, 0x1E, 0xB6, 0x1E, 0xB6, 0x1E, 0xB8, 0x1E, 0xB8, 0x1E, 0xBA, 0x1E, 0xBA, 0x1E,
+	0xBC, 0x1E, 0xBC, 0x1E, 0xBE, 0x1E, 0xBE, 0x1E, 0xC0, 0x1E, 0xC0, 0x1E, 0xC2, 0x1E, 0xC2, 0x1E,
+	0xC4, 0x1E, 0xC4, 0x1E, 0xC6, 0x1E, 0xC6, 0x1E, 0xC8, 0x1E, 0xC8, 0x1E, 0xCA, 0x1E, 0xCA, 0x1E,
+	0xCC, 0x1E, 0xCC, 0x1E, 0xCE, 0x1E, 0xCE, 0x1E, 0xD0, 0x1E, 0xD0, 0x1E, 0xD2, 0x1E, 0xD2, 0x1E,
+	0xD4, 0x1E, 0xD4, 0x1E, 0xD6, 0x1E, 0xD6, 0x1E, 0xD8, 0x1E, 0xD8, 0x1E, 0xDA, 0x1E, 0xDA, 0x1E,
+	0xDC, 0x1E, 0xDC, 0x1E, 0xDE, 0x1E, 0xDE, 0x1E, 0xE0, 0x1E, 0xE0, 0x1E, 0xE2, 0x1E, 0xE2, 0x1E,
+	0xE4, 0x1E, 0xE4, 0x1E, 0xE6, 0x1E, 0xE6, 0x1E, 0xE8, 0x1E, 0xE8, 0x1E, 0xEA, 0x1E, 0xEA, 0x1E,
+	0xEC, 0x1E, 0xEC, 0x1E, 0xEE, 0x1E, 0xEE, 0x1E, 0xF0, 0x1E, 0xF0, 0x1E, 0xF2, 0x1E, 0xF2, 0x1E,
+	0xF4, 0x1E, 0xF4, 0x1E, 0xF6, 0x1E, 0xF6, 0x1E, 0xF8, 0x1E, 0xF8, 0x1E, 0xFA, 0x1E, 0xFB, 0x1E,
+	0xFC, 0x1E, 0xFD, 0x1E, 0xFE, 0x1E, 0xFF, 0x1E, 0x08, 0x1F, 0x09, 0x1F, 0x0A, 0x1F, 0x0B, 0x1F,
+	0x0C, 0x1F, 0x0D, 0x1F, 0x0E, 0x1F, 0x0F, 0x1F, 0x08, 0x1F, 0x09, 0x1F, 0x0A, 0x1F, 0x0B, 0x1F,
+	0x0C, 0x1F, 0x0D, 0x1F, 0x0E, 0x1F, 0x0F, 0x1F, 0x18, 0x1F, 0x19, 0x1F, 0x1A, 0x1F, 0x1B, 0x1F,
+	0x1C, 0x1F, 0x1D, 0x1F, 0x16, 0x1F, 0x17, 0x1F, 0x18, 0x1F, 0x19, 0x1F, 0x1A, 0x1F, 0x1B, 0x1F,
+	0x1C, 0x1F, 0x1D, 0x1F, 0x1E, 0x1F, 0x1F, 0x1F, 0x28, 0x1F, 0x29, 0x1F, 0x2A, 0x1F, 0x2B, 0x1F,
+	0x2C, 0x1F, 0x2D, 0x1F, 0x2E, 0x1F, 0x2F, 0x1F, 0x28, 0x1F, 0x29, 0x1F, 0x2A, 0x1F, 0x2B, 0x1F,
+	0x2C, 0x1F, 0x2D, 0x1F, 0x2E, 0x1F, 0x2F, 0x1F, 0x38, 0x1F, 0x39, 0x1F, 0x3A, 0x1F, 0x3B, 0x1F,
+	0x3C, 0x1F, 0x3D, 0x1F, 0x3E, 0x1F, 0x3F, 0x1F, 0x38, 0x1F, 0x39, 0x1F, 0x3A, 0x1F, 0x3B, 0x1F,
+	0x3C, 0x1F, 0x3D, 0x1F, 0x3E, 0x1F, 0x3F, 0x1F, 0x48, 0x1F, 0x49, 0x1F, 0x4A, 0x1F, 0x4B, 0x1F,
+	0x4C, 0x1F, 0x4D, 0x1F, 0x46, 0x1F, 0x47, 0x1F, 0x48, 0x1F, 0x49, 0x1F, 0x4A, 0x1F, 0x4B, 0x1F,
+	0x4C, 0x1F, 0x4D, 0x1F, 0x4E, 0x1F, 0x4F, 0x1F, 0x50, 0x1F, 0x59, 0x1F, 0x52, 0x1F, 0x5B, 0x1F,
+	0x54, 0x1F, 0x5D, 0x1F, 0x56, 0x1F, 0x5F, 0x1F, 0x58, 0x1F, 0x59, 0x1F, 0x5A, 0x1F, 0x5B, 0x1F,
+	0x5C, 0x1F, 0x5D, 0x1F, 0x5E, 0x1F, 0x5F, 0x1F, 0x68, 0x1F, 0x69, 0x1F, 0x6A, 0x1F, 0x6B, 0x1F,
+	0x6C, 0x1F, 0x6D, 0x1F, 0x6E, 0x1F, 0x6F, 0x1F, 0x68, 0x1F, 0x69, 0x1F, 0x6A, 0x1F, 0x6B, 0x1F,
+	0x6C, 0x1F, 0x6D, 0x1F, 0x6E, 0x1F, 0x6F, 0x1F, 0xBA, 0x1F, 0xBB, 0x1F, 0xC8, 0x1F, 0xC9, 0x1F,
+	0xCA, 0x1F, 0xCB, 0x1F, 0xDA, 0x1F, 0xDB, 0x1F, 0xF8, 0x1F, 0xF9, 0x1F, 0xEA, 0x1F, 0xEB, 0x1F,
+	0xFA, 0x1F, 0xFB, 0x1F, 0x7E, 0x1F, 0x7F, 0x1F, 0x88, 0x1F, 0x89, 0x1F, 0x8A, 0x1F, 0x8B, 0x1F,
+	0x8C, 0x1F, 0x8D, 0x1F, 0x8E, 0x1F, 0x8F, 0x1F, 0x88, 0x1F, 0x89, 0x1F, 0x8A, 0x1F, 0x8B, 0x1F,
+	0x8C, 0x1F, 0x8D, 0x1F, 0x8E, 0x1F, 0x8F, 0x1F, 0x98, 0x1F, 0x99, 0x1F, 0x9A, 0x1F, 0x9B, 0x1F,
+	0x9C, 0x1F, 0x9D, 0x1F, 0x9E, 0x1F, 0x9F, 0x1F, 0x98, 0x1F, 0x99, 0x1F, 0x9A, 0x1F, 0x9B, 0x1F,
+	0x9C, 0x1F, 0x9D, 0x1F, 0x9E, 0x1F, 0x9F, 0x1F, 0xA8, 0x1F, 0xA9, 0x1F, 0xAA, 0x1F, 0xAB, 0x1F,
+	0xAC, 0x1F, 0xAD, 0x1F, 0xAE, 0x1F, 0xAF, 0x1F, 0xA8, 0x1F, 0xA9, 0x1F, 0xAA, 0x1F, 0xAB, 0x1F,
+	0xAC, 0x1F, 0xAD, 0x1F, 0xAE, 0x1F, 0xAF, 0x1F, 0xB8, 0x1F, 0xB9, 0x1F, 0xB2, 0x1F, 0xBC, 0x1F,
+	0xB4, 0x1F, 0xB5, 0x1F, 0xB6, 0x1F, 0xB7, 0x1F, 0xB8, 0x1F, 0xB9, 0x1F, 0xBA, 0x1F, 0xBB, 0x1F,
+	0xBC, 0x1F, 0xBD, 0x1F, 0xBE, 0x1F, 0xBF, 0x1F, 0xC0, 0x1F, 0xC1, 0x1F, 0xC2, 0x1F, 0xC3, 0x1F,
+	0xC4, 0x1F, 0xC5, 0x1F, 0xC6, 0x1F, 0xC7, 0x1F, 0xC8, 0x1F, 0xC9, 0x1F, 0xCA, 0x1F, 0xCB, 0x1F,
+	0xC3, 0x1F, 0xCD, 0x1F, 0xCE, 0x1F, 0xCF, 0x1F, 0xD8, 0x1F, 0xD9, 0x1F, 0xD2, 0x1F, 0xD3, 0x1F,
+	0xD4, 0x1F, 0xD5, 0x1F, 0xD6, 0x1F, 0xD7, 0x1F, 0xD8, 0x1F, 0xD9, 0x1F, 0xDA, 0x1F, 0xDB, 0x1F,
+	0xDC, 0x1F, 0xDD, 0x1F, 0xDE, 0x1F, 0xDF, 0x1F, 0xE8, 0x1F, 0xE9, 0x1F, 0xE2, 0x1F, 0xE3, 0x1F,
+	0xE4, 0x1F, 0xEC, 0x1F, 0xE6, 0x1F, 0xE7, 0x1F, 0xE8, 0x1F, 0xE9, 0x1F, 0xEA, 0x1F, 0xEB, 0x1F,
+	0xEC, 0x1F, 0xED, 0x1F, 0xEE, 0x1F, 0xEF, 0x1F, 0xF0, 0x1F, 0xF1, 0x1F, 0xF2, 0x1F, 0xF3, 0x1F,
+	0xF4, 0x1F, 0xF5, 0x1F, 0xF6, 0x1F, 0xF7, 0x1F, 0xF8, 0x1F, 0xF9, 0x1F, 0xFA, 0x1F, 0xFB, 0x1F,
+	0xF3, 0x1F, 0xFD, 0x1F, 0xFE, 0x1F, 0xFF, 0x1F, 0x00, 0x20, 0x01, 0x20, 0x02, 0x20, 0x03, 0x20,
+	0x04, 0x20, 0x05, 0x20, 0x06, 0x20, 0x07, 0x20, 0x08, 0x20, 0x09, 0x20, 0x0A, 0x20, 0x0B, 0x20,
+	0x0C, 0x20, 0x0D, 0x20, 0x0E, 0x20, 0x0F, 0x20, 0x10, 0x20, 0x11, 0x20, 0x12, 0x20, 0x13, 0x20,
+	0x14, 0x20, 0x15, 0x20, 0x16, 0x20, 0x17, 0x20, 0x18, 0x20, 0x19, 0x20, 0x1A, 0x20, 0x1B, 0x20,
+	0x1C, 0x20, 0x1D, 0x20, 0x1E, 0x20, 0x1F, 0x20, 0x20, 0x20, 0x21, 0x20, 0x22, 0x20, 0x23, 0x20,
+	0x24, 0x20, 0x25, 0x20, 0x26, 0x20, 0x27, 0x20, 0x28, 0x20, 0x29, 0x20, 0x2A, 0x20, 0x2B, 0x20,
+	0x2C, 0x20, 0x2D, 0x20, 0x2E, 0x20, 0x2F, 0x20, 0x30, 0x20, 0x31, 0x20, 0x32, 0x20, 0x33, 0x20,
+	0x34, 0x20, 0x35, 0x20, 0x36, 0x20, 0x37, 0x20, 0x38, 0x20, 0x39, 0x20, 0x3A, 0x20, 0x3B, 0x20,
+	0x3C, 0x20, 0x3D, 0x20, 0x3E, 0x20, 0x3F, 0x20, 0x40, 0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20,
+	0x44, 0x20, 0x45, 0x20, 0x46, 0x20, 0x47, 0x20, 0x48, 0x20, 0x49, 0x20, 0x4A, 0x20, 0x4B, 0x20,
+	0x4C, 0x20, 0x4D, 0x20, 0x4E, 0x20, 0x4F, 0x20, 0x50, 0x20, 0x51, 0x20, 0x52, 0x20, 0x53, 0x20,
+	0x54, 0x20, 0x55, 0x20, 0x56, 0x20, 0x57, 0x20, 0x58, 0x20, 0x59, 0x20, 0x5A, 0x20, 0x5B, 0x20,
+	0x5C, 0x20, 0x5D, 0x20, 0x5E, 0x20, 0x5F, 0x20, 0x60, 0x20, 0x61, 0x20, 0x62, 0x20, 0x63, 0x20,
+	0x64, 0x20, 0x65, 0x20, 0x66, 0x20, 0x67, 0x20, 0x68, 0x20, 0x69, 0x20, 0x6A, 0x20, 0x6B, 0x20,
+	0x6C, 0x20, 0x6D, 0x20, 0x6E, 0x20, 0x6F, 0x20, 0x70, 0x20, 0x71, 0x20, 0x72, 0x20, 0x73, 0x20,
+	0x74, 0x20, 0x75, 0x20, 0x76, 0x20, 0x77, 0x20, 0x78, 0x20, 0x79, 0x20, 0x7A, 0x20, 0x7B, 0x20,
+	0x7C, 0x20, 0x7D, 0x20, 0x7E, 0x20, 0x7F, 0x20, 0x80, 0x20, 0x81, 0x20, 0x82, 0x20, 0x83, 0x20,
+	0x84, 0x20, 0x85, 0x20, 0x86, 0x20, 0x87, 0x20, 0x88, 0x20, 0x89, 0x20, 0x8A, 0x20, 0x8B, 0x20,
+	0x8C, 0x20, 0x8D, 0x20, 0x8E, 0x20, 0x8F, 0x20, 0x90, 0x20, 0x91, 0x20, 0x92, 0x20, 0x93, 0x20,
+	0x94, 0x20, 0x95, 0x20, 0x96, 0x20, 0x97, 0x20, 0x98, 0x20, 0x99, 0x20, 0x9A, 0x20, 0x9B, 0x20,
+	0x9C, 0x20, 0x9D, 0x20, 0x9E, 0x20, 0x9F, 0x20, 0xA0, 0x20, 0xA1, 0x20, 0xA2, 0x20, 0xA3, 0x20,
+	0xA4, 0x20, 0xA5, 0x20, 0xA6, 0x20, 0xA7, 0x20, 0xA8, 0x20, 0xA9, 0x20, 0xAA, 0x20, 0xAB, 0x20,
+	0xAC, 0x20, 0xAD, 0x20, 0xAE, 0x20, 0xAF, 0x20, 0xB0, 0x20, 0xB1, 0x20, 0xB2, 0x20, 0xB3, 0x20,
+	0xB4, 0x20, 0xB5, 0x20, 0xB6, 0x20, 0xB7, 0x20, 0xB8, 0x20, 0xB9, 0x20, 0xBA, 0x20, 0xBB, 0x20,
+	0xBC, 0x20, 0xBD, 0x20, 0xBE, 0x20, 0xBF, 0x20, 0xC0, 0x20, 0xC1, 0x20, 0xC2, 0x20, 0xC3, 0x20,
+	0xC4, 0x20, 0xC5, 0x20, 0xC6, 0x20, 0xC7, 0x20, 0xC8, 0x20, 0xC9, 0x20, 0xCA, 0x20, 0xCB, 0x20,
+	0xCC, 0x20, 0xCD, 0x20, 0xCE, 0x20, 0xCF, 0x20, 0xD0, 0x20, 0xD1, 0x20, 0xD2, 0x20, 0xD3, 0x20,
+	0xD4, 0x20, 0xD5, 0x20, 0xD6, 0x20, 0xD7, 0x20, 0xD8, 0x20, 0xD9, 0x20, 0xDA, 0x20, 0xDB, 0x20,
+	0xDC, 0x20, 0xDD, 0x20, 0xDE, 0x20, 0xDF, 0x20, 0xE0, 0x20, 0xE1, 0x20, 0xE2, 0x20, 0xE3, 0x20,
+	0xE4, 0x20, 0xE5, 0x20, 0xE6, 0x20, 0xE7, 0x20, 0xE8, 0x20, 0xE9, 0x20, 0xEA, 0x20, 0xEB, 0x20,
+	0xEC, 0x20, 0xED, 0x20, 0xEE, 0x20, 0xEF, 0x20, 0xF0, 0x20, 0xF1, 0x20, 0xF2, 0x20, 0xF3, 0x20,
+	0xF4, 0x20, 0xF5, 0x20, 0xF6, 0x20, 0xF7, 0x20, 0xF8, 0x20, 0xF9, 0x20, 0xFA, 0x20, 0xFB, 0x20,
+	0xFC, 0x20, 0xFD, 0x20, 0xFE, 0x20, 0xFF, 0x20, 0x00, 0x21, 0x01, 0x21, 0x02, 0x21, 0x03, 0x21,
+	0x04, 0x21, 0x05, 0x21, 0x06, 0x21, 0x07, 0x21, 0x08, 0x21, 0x09, 0x21, 0x0A, 0x21, 0x0B, 0x21,
+	0x0C, 0x21, 0x0D, 0x21, 0x0E, 0x21, 0x0F, 0x21, 0x10, 0x21, 0x11, 0x21, 0x12, 0x21, 0x13, 0x21,
+	0x14, 0x21, 0x15, 0x21, 0x16, 0x21, 0x17, 0x21, 0x18, 0x21, 0x19, 0x21, 0x1A, 0x21, 0x1B, 0x21,
+	0x1C, 0x21, 0x1D, 0x21, 0x1E, 0x21, 0x1F, 0x21, 0x20, 0x21, 0x21, 0x21, 0x22, 0x21, 0x23, 0x21,
+	0x24, 0x21, 0x25, 0x21, 0x26, 0x21, 0x27, 0x21, 0x28, 0x21, 0x29, 0x21, 0x2A, 0x21, 0x2B, 0x21,
+	0x2C, 0x21, 0x2D, 0x21, 0x2E, 0x21, 0x2F, 0x21, 0x30, 0x21, 0x31, 0x21, 0x32, 0x21, 0x33, 0x21,
+	0x34, 0x21, 0x35, 0x21, 0x36, 0x21, 0x37, 0x21, 0x38, 0x21, 0x39, 0x21, 0x3A, 0x21, 0x3B, 0x21,
+	0x3C, 0x21, 0x3D, 0x21, 0x3E, 0x21, 0x3F, 0x21, 0x40, 0x21, 0x41, 0x21, 0x42, 0x21, 0x43, 0x21,
+	0x44, 0x21, 0x45, 0x21, 0x46, 0x21, 0x47, 0x21, 0x48, 0x21, 0x49, 0x21, 0x4A, 0x21, 0x4B, 0x21,
+	0x4C, 0x21, 0x4D, 0x21, 0x32, 0x21, 0x4F, 0x21, 0x50, 0x21, 0x51, 0x21, 0x52, 0x21, 0x53, 0x21,
+	0x54, 0x21, 0x55, 0x21, 0x56, 0x21, 0x57, 0x21, 0x58, 0x21, 0x59, 0x21, 0x5A, 0x21, 0x5B, 0x21,
+	0x5C, 0x21, 0x5D, 0x21, 0x5E, 0x21, 0x5F, 0x21, 0x60, 0x21, 0x61, 0x21, 0x62, 0x21, 0x63, 0x21,
+	0x64, 0x21, 0x65, 0x21, 0x66, 0x21, 0x67, 0x21, 0x68, 0x21, 0x69, 0x21, 0x6A, 0x21, 0x6B, 0x21,
+	0x6C, 0x21, 0x6D, 0x21, 0x6E, 0x21, 0x6F, 0x21, 0x60, 0x21, 0x61, 0x21, 0x62, 0x21, 0x63, 0x21,
+	0x64, 0x21, 0x65, 0x21, 0x66, 0x21, 0x67, 0x21, 0x68, 0x21, 0x69, 0x21, 0x6A, 0x21, 0x6B, 0x21,
+	0x6C, 0x21, 0x6D, 0x21, 0x6E, 0x21, 0x6F, 0x21, 0x80, 0x21, 0x81, 0x21, 0x82, 0x21, 0x83, 0x21,
+	0x83, 0x21, 0xFF, 0xFF, 0x4B, 0x03, 0xB6, 0x24, 0xB7, 0x24, 0xB8, 0x24, 0xB9, 0x24, 0xBA, 0x24,
+	0xBB, 0x24, 0xBC, 0x24, 0xBD, 0x24, 0xBE, 0x24, 0xBF, 0x24, 0xC0, 0x24, 0xC1, 0x24, 0xC2, 0x24,
+	0xC3, 0x24, 0xC4, 0x24, 0xC5, 0x24, 0xC6, 0x24, 0xC7, 0x24, 0xC8, 0x24, 0xC9, 0x24, 0xCA, 0x24,
+	0xCB, 0x24, 0xCC, 0x24, 0xCD, 0x24, 0xCE, 0x24, 0xCF, 0x24, 0xFF, 0xFF, 0x46, 0x07, 0x00, 0x2C,
+	0x01, 0x2C, 0x02, 0x2C, 0x03, 0x2C, 0x04, 0x2C, 0x05, 0x2C, 0x06, 0x2C, 0x07, 0x2C, 0x08, 0x2C,
+	0x09, 0x2C, 0x0A, 0x2C, 0x0B, 0x2C, 0x0C, 0x2C, 0x0D, 0x2C, 0x0E, 0x2C, 0x0F, 0x2C, 0x10, 0x2C,
+	0x11, 0x2C, 0x12, 0x2C, 0x13, 0x2C, 0x14, 0x2C, 0x15, 0x2C, 0x16, 0x2C, 0x17, 0x2C, 0x18, 0x2C,
+	0x19, 0x2C, 0x1A, 0x2C, 0x1B, 0x2C, 0x1C, 0x2C, 0x1D, 0x2C, 0x1E, 0x2C, 0x1F, 0x2C, 0x20, 0x2C,
+	0x21, 0x2C, 0x22, 0x2C, 0x23, 0x2C, 0x24, 0x2C, 0x25, 0x2C, 0x26, 0x2C, 0x27, 0x2C, 0x28, 0x2C,
+	0x29, 0x2C, 0x2A, 0x2C, 0x2B, 0x2C, 0x2C, 0x2C, 0x2D, 0x2C, 0x2E, 0x2C, 0x5F, 0x2C, 0x60, 0x2C,
+	0x60, 0x2C, 0x62, 0x2C, 0x63, 0x2C, 0x64, 0x2C, 0x65, 0x2C, 0x66, 0x2C, 0x67, 0x2C, 0x67, 0x2C,
+	0x69, 0x2C, 0x69, 0x2C, 0x6B, 0x2C, 0x6B, 0x2C, 0x6D, 0x2C, 0x6E, 0x2C, 0x6F, 0x2C, 0x70, 0x2C,
+	0x71, 0x2C, 0x72, 0x2C, 0x73, 0x2C, 0x74, 0x2C, 0x75, 0x2C, 0x75, 0x2C, 0x77, 0x2C, 0x78, 0x2C,
+	0x79, 0x2C, 0x7A, 0x2C, 0x7B, 0x2C, 0x7C, 0x2C, 0x7D, 0x2C, 0x7E, 0x2C, 0x7F, 0x2C, 0x80, 0x2C,
+	0x80, 0x2C, 0x82, 0x2C, 0x82, 0x2C, 0x84, 0x2C, 0x84, 0x2C, 0x86, 0x2C, 0x86, 0x2C, 0x88, 0x2C,
+	0x88, 0x2C, 0x8A, 0x2C, 0x8A, 0x2C, 0x8C, 0x2C, 0x8C, 0x2C, 0x8E, 0x2C, 0x8E, 0x2C, 0x90, 0x2C,
+	0x90, 0x2C, 0x92, 0x2C, 0x92, 0x2C, 0x94, 0x2C, 0x94, 0x2C, 0x96, 0x2C, 0x96, 0x2C, 0x98, 0x2C,
+	0x98, 0x2C, 0x9A, 0x2C, 0x9A, 0x2C, 0x9C, 0x2C, 0x9C, 0x2C, 0x9E, 0x2C, 0x9E, 0x2C, 0xA0, 0x2C,
+	0xA0, 0x2C, 0xA2, 0x2C, 0xA2, 0x2C, 0xA4, 0x2C, 0xA4, 0x2C, 0xA6, 0x2C, 0xA6, 0x2C, 0xA8, 0x2C,
+	0xA8, 0x2C, 0xAA, 0x2C, 0xAA, 0x2C, 0xAC, 0x2C, 0xAC, 0x2C, 0xAE, 0x2C, 0xAE, 0x2C, 0xB0, 0x2C,
+	0xB0, 0x2C, 0xB2, 0x2C, 0xB2, 0x2C, 0xB4, 0x2C, 0xB4, 0x2C, 0xB6, 0x2C, 0xB6, 0x2C, 0xB8, 0x2C,
+	0xB8, 0x2C, 0xBA, 0x2C, 0xBA, 0x2C, 0xBC, 0x2C, 0xBC, 0x2C, 0xBE, 0x2C, 0xBE, 0x2C, 0xC0, 0x2C,
+	0xC0, 0x2C, 0xC2, 0x2C, 0xC2, 0x2C, 0xC4, 0x2C, 0xC4, 0x2C, 0xC6, 0x2C, 0xC6, 0x2C, 0xC8, 0x2C,
+	0xC8, 0x2C, 0xCA, 0x2C, 0xCA, 0x2C, 0xCC, 0x2C, 0xCC, 0x2C, 0xCE, 0x2C, 0xCE, 0x2C, 0xD0, 0x2C,
+	0xD0, 0x2C, 0xD2, 0x2C, 0xD2, 0x2C, 0xD4, 0x2C, 0xD4, 0x2C, 0xD6, 0x2C, 0xD6, 0x2C, 0xD8, 0x2C,
+	0xD8, 0x2C, 0xDA, 0x2C, 0xDA, 0x2C, 0xDC, 0x2C, 0xDC, 0x2C, 0xDE, 0x2C, 0xDE, 0x2C, 0xE0, 0x2C,
+	0xE0, 0x2C, 0xE2, 0x2C, 0xE2, 0x2C, 0xE4, 0x2C, 0xE5, 0x2C, 0xE6, 0x2C, 0xE7, 0x2C, 0xE8, 0x2C,
+	0xE9, 0x2C, 0xEA, 0x2C, 0xEB, 0x2C, 0xEC, 0x2C, 0xED, 0x2C, 0xEE, 0x2C, 0xEF, 0x2C, 0xF0, 0x2C,
+	0xF1, 0x2C, 0xF2, 0x2C, 0xF3, 0x2C, 0xF4, 0x2C, 0xF5, 0x2C, 0xF6, 0x2C, 0xF7, 0x2C, 0xF8, 0x2C,
+	0xF9, 0x2C, 0xFA, 0x2C, 0xFB, 0x2C, 0xFC, 0x2C, 0xFD, 0x2C, 0xFE, 0x2C, 0xFF, 0x2C, 0xA0, 0x10,
+	0xA1, 0x10, 0xA2, 0x10, 0xA3, 0x10, 0xA4, 0x10, 0xA5, 0x10, 0xA6, 0x10, 0xA7, 0x10, 0xA8, 0x10,
+	0xA9, 0x10, 0xAA, 0x10, 0xAB, 0x10, 0xAC, 0x10, 0xAD, 0x10, 0xAE, 0x10, 0xAF, 0x10, 0xB0, 0x10,
+	0xB1, 0x10, 0xB2, 0x10, 0xB3, 0x10, 0xB4, 0x10, 0xB5, 0x10, 0xB6, 0x10, 0xB7, 0x10, 0xB8, 0x10,
+	0xB9, 0x10, 0xBA, 0x10, 0xBB, 0x10, 0xBC, 0x10, 0xBD, 0x10, 0xBE, 0x10, 0xBF, 0x10, 0xC0, 0x10,
+	0xC1, 0x10, 0xC2, 0x10, 0xC3, 0x10, 0xC4, 0x10, 0xC5, 0x10, 0xFF, 0xFF, 0x1B, 0xD2, 0x21, 0xFF,
+	0x22, 0xFF, 0x23, 0xFF, 0x24, 0xFF, 0x25, 0xFF, 0x26, 0xFF, 0x27, 0xFF, 0x28, 0xFF, 0x29, 0xFF,
+	0x2A, 0xFF, 0x2B, 0xFF, 0x2C, 0xFF, 0x2D, 0xFF, 0x2E, 0xFF, 0x2F, 0xFF, 0x30, 0xFF, 0x31, 0xFF,
+	0x32, 0xFF, 0x33, 0xFF, 0x34, 0xFF, 0x35, 0xFF, 0x36, 0xFF, 0x37, 0xFF, 0x38, 0xFF, 0x39, 0xFF,
+	0x3A, 0xFF, 0x5B, 0xFF, 0x5C, 0xFF, 0x5D, 0xFF, 0x5E, 0xFF, 0x5F, 0xFF, 0x60, 0xFF, 0x61, 0xFF,
+	0x62, 0xFF, 0x63, 0xFF, 0x64, 0xFF, 0x65, 0xFF, 0x66, 0xFF, 0x67, 0xFF, 0x68, 0xFF, 0x69, 0xFF,
+	0x6A, 0xFF, 0x6B, 0xFF, 0x6C, 0xFF, 0x6D, 0xFF, 0x6E, 0xFF, 0x6F, 0xFF, 0x70, 0xFF, 0x71, 0xFF,
+	0x72, 0xFF, 0x73, 0xFF, 0x74, 0xFF, 0x75, 0xFF, 0x76, 0xFF, 0x77, 0xFF, 0x78, 0xFF, 0x79, 0xFF,
+	0x7A, 0xFF, 0x7B, 0xFF, 0x7C, 0xFF, 0x7D, 0xFF, 0x7E, 0xFF, 0x7F, 0xFF, 0x80, 0xFF, 0x81, 0xFF,
+	0x82, 0xFF, 0x83, 0xFF, 0x84, 0xFF, 0x85, 0xFF, 0x86, 0xFF, 0x87, 0xFF, 0x88, 0xFF, 0x89, 0xFF,
+	0x8A, 0xFF, 0x8B, 0xFF, 0x8C, 0xFF, 0x8D, 0xFF, 0x8E, 0xFF, 0x8F, 0xFF, 0x90, 0xFF, 0x91, 0xFF,
+	0x92, 0xFF, 0x93, 0xFF, 0x94, 0xFF, 0x95, 0xFF, 0x96, 0xFF, 0x97, 0xFF, 0x98, 0xFF, 0x99, 0xFF,
+	0x9A, 0xFF, 0x9B, 0xFF, 0x9C, 0xFF, 0x9D, 0xFF, 0x9E, 0xFF, 0x9F, 0xFF, 0xA0, 0xFF, 0xA1, 0xFF,
+	0xA2, 0xFF, 0xA3, 0xFF, 0xA4, 0xFF, 0xA5, 0xFF, 0xA6, 0xFF, 0xA7, 0xFF, 0xA8, 0xFF, 0xA9, 0xFF,
+	0xAA, 0xFF, 0xAB, 0xFF, 0xAC, 0xFF, 0xAD, 0xFF, 0xAE, 0xFF, 0xAF, 0xFF, 0xB0, 0xFF, 0xB1, 0xFF,
+	0xB2, 0xFF, 0xB3, 0xFF, 0xB4, 0xFF, 0xB5, 0xFF, 0xB6, 0xFF, 0xB7, 0xFF, 0xB8, 0xFF, 0xB9, 0xFF,
+	0xBA, 0xFF, 0xBB, 0xFF, 0xBC, 0xFF, 0xBD, 0xFF, 0xBE, 0xFF, 0xBF, 0xFF, 0xC0, 0xFF, 0xC1, 0xFF,
+	0xC2, 0xFF, 0xC3, 0xFF, 0xC4, 0xFF, 0xC5, 0xFF, 0xC6, 0xFF, 0xC7, 0xFF, 0xC8, 0xFF, 0xC9, 0xFF,
+	0xCA, 0xFF, 0xCB, 0xFF, 0xCC, 0xFF, 0xCD, 0xFF, 0xCE, 0xFF, 0xCF, 0xFF, 0xD0, 0xFF, 0xD1, 0xFF,
+	0xD2, 0xFF, 0xD3, 0xFF, 0xD4, 0xFF, 0xD5, 0xFF, 0xD6, 0xFF, 0xD7, 0xFF, 0xD8, 0xFF, 0xD9, 0xFF,
+	0xDA, 0xFF, 0xDB, 0xFF, 0xDC, 0xFF, 0xDD, 0xFF, 0xDE, 0xFF, 0xDF, 0xFF, 0xE0, 0xFF, 0xE1, 0xFF,
+	0xE2, 0xFF, 0xE3, 0xFF, 0xE4, 0xFF, 0xE5, 0xFF, 0xE6, 0xFF, 0xE7, 0xFF, 0xE8, 0xFF, 0xE9, 0xFF,
+	0xEA, 0xFF, 0xEB, 0xFF, 0xEC, 0xFF, 0xED, 0xFF, 0xEE, 0xFF, 0xEF, 0xFF, 0xF0, 0xFF, 0xF1, 0xFF,
+	0xF2, 0xFF, 0xF3, 0xFF, 0xF4, 0xFF, 0xF5, 0xFF, 0xF6, 0xFF, 0xF7, 0xFF, 0xF8, 0xFF, 0xF9, 0xFF,
+	0xFA, 0xFF, 0xFB, 0xFF, 0xFC, 0xFF, 0xFD, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF
+};
diff --git b/fs/exfat/exfat_version.h b/fs/exfat/exfat_version.h
new file mode 100644
index 0000000..a93fa46
--- /dev/null
+++ b/fs/exfat/exfat_version.h
@@ -0,0 +1,19 @@
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_version.h                                           */
+/*  PURPOSE : exFAT File Manager                                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY                                                    */
+/*                                                                      */
+/*  - 2012.02.10 : Release Version 1.1.0                                */
+/*  - 2012.04.02 : P1 : Change Module License to Samsung Proprietary    */
+/*  - 2012.06.07 : P2 : Fixed incorrect filename problem                */
+/*                                                                      */
+/************************************************************************/
+
+#define EXFAT_VERSION  "1.2.9"
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 350a2c8..04fd33c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -29,7 +29,7 @@
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
-static int setfl(int fd, struct file * filp, unsigned long arg)
+int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = file_inode(filp);
 	int error = 0;
@@ -60,6 +60,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	if (filp->f_op->check_flags)
 		error = filp->f_op->check_flags(arg);
+	if (!error && filp->f_op->setfl)
+		error = filp->f_op->setfl(filp, arg);
 	if (error)
 		return error;
 
@@ -80,6 +82,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
  out:
 	return error;
 }
+EXPORT_SYMBOL_GPL(setfl);
 
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                      int force)
diff --git a/fs/file_table.c b/fs/file_table.c
index ad17e05..ae9f267 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -147,6 +147,7 @@ over:
 	}
 	return ERR_PTR(-ENFILE);
 }
+EXPORT_SYMBOL_GPL(get_empty_filp);
 
 /**
  * alloc_file - allocate and initialize a 'struct file'
@@ -258,6 +259,7 @@ void flush_delayed_fput(void)
 {
 	delayed_fput(NULL);
 }
+EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
@@ -300,6 +302,7 @@ void __fput_sync(struct file *file)
 }
 
 EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL_GPL(__fput_sync);
 
 void put_filp(struct file *file)
 {
@@ -308,6 +311,7 @@ void put_filp(struct file *file)
 		file_free(file);
 	}
 }
+EXPORT_SYMBOL_GPL(put_filp);
 
 void __init files_init(void)
 { 
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fb1691..97654d2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,6 +463,7 @@ void __mnt_drop_write(struct vfsmount *mnt)
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(__mnt_drop_write);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -1811,6 +1812,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(iterate_mounts);
 
 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
 {
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62c..53e45b6 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -72,6 +73,7 @@ void fsnotify_get_group(struct fsnotify_group *group)
 {
 	atomic_inc(&group->refcnt);
 }
+EXPORT_SYMBOL_GPL(fsnotify_get_group);
 
 /*
  * Drop a reference to a group.  Free it if it's through.
@@ -81,6 +83,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	if (atomic_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -109,6 +112,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d..ac2bd69 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -113,6 +113,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 		mark->free_mark(mark);
 	}
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /* Calculate mask of events for a list of marks */
 u32 fsnotify_recalc_mask(struct hlist_head *head)
@@ -213,6 +214,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
@@ -398,6 +400,7 @@ err:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
 		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
@@ -498,6 +501,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
 static void fsnotify_mark_destroy(struct work_struct *work)
 {
diff --git a/fs/open.c b/fs/open.c
index 17cb6b1..e6933ee 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,6 +34,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs.h>
+
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
@@ -64,6 +67,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(do_truncate);
 
 long vfs_truncate(struct path *path, loff_t length)
 {
@@ -678,6 +682,7 @@ int open_check_o_direct(struct file *f)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(open_check_o_direct);
 
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
@@ -1024,6 +1029,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 		} else {
 			fsnotify_open(f);
 			fd_install(fd, f);
+			trace_do_sys_open(tmp->name, flags, mode);
 		}
 	}
 	putname(tmp);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 45f2162..80e4bc6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -505,7 +505,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 		seq_printf(m, "0 0 0\n");
 	else
 		seq_printf(m, "%llu %llu %lu\n",
-		   (unsigned long long)task->se.sum_exec_runtime,
+		   (unsigned long long)tsk_seruntime(task),
 		   (unsigned long long)task->sched_info.run_delay,
 		   task->sched_info.pcount);
 
@@ -1934,7 +1934,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = vma_pr_or_file(vma)->f_path;
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index f8595e8..cb8eda0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,7 +45,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode = file_inode(region->vm_file);
+		struct inode *inode;
+
+		file = vmr_pr_or_file(region);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9d2f3e0..6c4f2fb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -298,7 +298,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode = file_inode(vma->vm_file);
+		struct inode *inode;
+
+		file = vma_pr_or_file(vma);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -1603,7 +1606,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
-	struct file *file = vma->vm_file;
+	struct file *file = vma_pr_or_file(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {
 		.hugetlb_entry = gather_hugetlb_stats,
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index faacb0c..17b43be 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -163,7 +163,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	file = vma->vm_file;
 
 	if (file) {
-		struct inode *inode = file_inode(vma->vm_file);
+		struct inode *inode;
+
+		file = vma_pr_or_file(vma);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/fs/read_write.c b/fs/read_write.c
index dadf24e..9461d12 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -534,6 +534,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 }
 EXPORT_SYMBOL(__vfs_write);
 
+vfs_readf_t vfs_readf(struct file *file)
+{
+	const struct file_operations *fop = file->f_op;
+
+	if (fop->read)
+		return fop->read;
+	if (fop->read_iter)
+		return new_sync_read;
+	return ERR_PTR(-ENOSYS);
+}
+EXPORT_SYMBOL_GPL(vfs_readf);
+
+vfs_writef_t vfs_writef(struct file *file)
+{
+	const struct file_operations *fop = file->f_op;
+
+	if (fop->write)
+		return fop->write;
+	if (fop->write_iter)
+		return new_sync_write;
+	return ERR_PTR(-ENOSYS);
+}
+EXPORT_SYMBOL_GPL(vfs_writef);
+
 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 {
 	mm_segment_t old_fs;
diff --git a/fs/splice.c b/fs/splice.c
index 19e0b10..3d8407c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1111,8 +1111,8 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   loff_t *ppos, size_t len, unsigned int flags)
+long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		    loff_t *ppos, size_t len, unsigned int flags)
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
@@ -1124,13 +1124,14 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 
 	return splice_write(pipe, out, ppos, len, flags);
 }
+EXPORT_SYMBOL_GPL(do_splice_from);
 
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, loff_t *ppos,
-			 struct pipe_inode_info *pipe, size_t len,
-			 unsigned int flags)
+long do_splice_to(struct file *in, loff_t *ppos,
+		  struct pipe_inode_info *pipe, size_t len,
+		  unsigned int flags)
 {
 	ssize_t (*splice_read)(struct file *, loff_t *,
 			       struct pipe_inode_info *, size_t, unsigned int);
@@ -1150,6 +1151,7 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 
 	return splice_read(in, ppos, pipe, len, flags);
 }
+EXPORT_SYMBOL_GPL(do_splice_to);
 
 /**
  * splice_direct_to_actor - splices data directly between two non-pipes
diff --git a/fs/super.c b/fs/super.c
index 74914b1..3446dc4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,7 +36,7 @@
 #include "internal.h"
 
 
-static LIST_HEAD(super_blocks);
+LIST_HEAD(super_blocks);
 static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
@@ -1372,3 +1372,50 @@ out:
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
+
+/**
+ * take_super_lock - stop other processes from modifying the super_blocks list
+ */
+void take_super_lock(void)
+{
+	spin_lock(&sb_lock);
+}
+
+/**
+ * release_super_lock - allow other processes to modify the super_blocks list again
+ */
+void release_super_lock(void)
+{
+	spin_unlock(&sb_lock);
+}
+
+/**
+ *	iterate_supers_no_sb_lock - call a function for all active superblocks without taking sb_lock
+ *	@f: function to call
+ *	@arg: argument to pass to it
+ *
+ *	Scans the superblock list and calls the given function, passing it a
+ *	locked superblock and the given argument.  The caller must already hold
+ *	sb_lock (see take_super_lock()); the callback must not release it.
+ */
+void iterate_supers_no_sb_lock(void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *p = NULL;
+
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		sb->s_count++;
+
+		down_read(&sb->s_umount);
+		if (sb->s_root && (sb->s_flags & MS_BORN))
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		if (p)
+			__put_super(p);
+		p = sb;
+	}
+	if (p)
+		__put_super(p);
+}
diff --git a/fs/xattr.c b/fs/xattr.c
index 4861322..c4bb039 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -207,6 +207,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 	*xattr_value = value;
 	return error;
 }
+EXPORT_SYMBOL_GPL(vfs_getxattr_alloc);
 
 ssize_t
 vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 88bc64f..b721a9d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -32,6 +32,8 @@
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
 
+extern int trap_non_toi_io;
+
 #define BIO_DEBUG
 
 #ifdef BIO_DEBUG
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 86a38ea..fee9e23 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -120,13 +120,14 @@ struct bio {
 #define BIO_QUIET	6	/* Make BIO Quiet */
 #define BIO_CHAIN	7	/* chained bio, ->bi_remaining in effect */
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
+#define BIO_TOI		9	/* bio was submitted by TuxOnIce */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
  * BIO_POOL_IDX()
  */
-#define BIO_RESET_BITS	13
-#define BIO_OWNS_VEC	13	/* bio_free() should free bvec */
+#define BIO_RESET_BITS	14
+#define BIO_OWNS_VEC	14	/* bio_free() should free bvec */
 
 /*
  * top 4 bits of bio flags indicate the pool this bio came from
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 413c84f..8fec976 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -39,13 +39,17 @@ struct blk_flush_queue;
 struct pr_ops;
 
 #define BLKDEV_MIN_RQ	4
+#ifdef CONFIG_PCK_INTERACTIVE
+#define BLKDEV_MAX_RQ	16
+#else
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
+#endif
 
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		2
+#define BLKCG_MAX_POLS		3
 
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
diff --git a/include/linux/console.h b/include/linux/console.h
index ea731af..3a61cf7 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -119,7 +119,7 @@ static inline int con_debug_leave(void)
 
 struct console {
 	char	name[16];
-	void	(*write)(struct console *, const char *, unsigned);
+	void	(*write)(struct console *, const char *, unsigned, unsigned int);
 	int	(*read)(struct console *, char *, unsigned);
 	struct tty_driver *(*device)(struct console *, int *);
 	void	(*unblank)(void);
diff --git a/include/linux/file.h b/include/linux/file.h
index f87d308..9a290b3 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -19,6 +19,7 @@ struct dentry;
 struct path;
 extern struct file *alloc_file(struct path *, fmode_t mode,
 	const struct file_operations *fop);
+extern struct file *get_empty_filp(void);
 
 static inline void fput_light(struct file *file, int fput_needed)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83c77b0..13eb8a1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1270,6 +1270,7 @@ extern void fasync_free(struct fasync_struct *);
 /* can be called from interrupts */
 extern void kill_fasync(struct fasync_struct **, int, int);
 
+extern int setfl(int fd, struct file * filp, unsigned long arg);
 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
 extern void f_setown(struct file *filp, unsigned long arg, int force);
 extern void f_delown(struct file *filp);
@@ -1288,6 +1289,8 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
+extern struct list_head super_blocks;
+
 /* sb->s_iflags */
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
@@ -1656,6 +1659,7 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
+	int (*setfl)(struct file *, unsigned long);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
@@ -1714,6 +1718,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      struct iovec *fast_pointer,
 			      struct iovec **ret_pointer);
 
+typedef ssize_t (*vfs_readf_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*vfs_writef_t)(struct file *, const char __user *, size_t,
+				loff_t *);
+vfs_readf_t vfs_readf(struct file *file);
+vfs_writef_t vfs_writef(struct file *file);
+
 extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
@@ -1784,6 +1794,8 @@ struct super_operations {
 #else
 #define S_DAX		0	/* Make all the DAX code disappear */
 #endif
+#define S_ATOMIC_COPY	16384	/* Pages mapped with this inode need to be
+				   atomically copied (gem) */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2316,6 +2328,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
+extern int fsync_super(struct super_block *);
+extern int fsync_no_super(struct block_device *);
+#define FS_FREEZER_FUSE 1
+#define FS_FREEZER_NORMAL 2
+#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL)
+void freeze_filesystems(int which);
+void thaw_filesystems(int which);
 #ifdef CONFIG_FS_DAX
 extern bool blkdev_dax_capable(struct block_device *bdev);
 #else
@@ -3088,4 +3107,7 @@ static inline bool dir_relax(struct inode *inode)
 extern bool path_noexec(const struct path *path);
 extern void inode_nohighmem(struct inode *inode);
 
+extern void take_super_lock(void);
+extern void release_super_lock(void);
+
 #endif /* _LINUX_FS_H */
diff --git b/include/linux/fs_uuid.h b/include/linux/fs_uuid.h
new file mode 100644
index 0000000..3234135
--- /dev/null
+++ b/include/linux/fs_uuid.h
@@ -0,0 +1,19 @@
+#include <linux/device.h>
+
+struct hd_struct;
+struct block_device;
+
+struct fs_info {
+	char uuid[16];
+	dev_t dev_t;
+	char *last_mount;
+	int last_mount_size;
+};
+
+int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek);
+dev_t blk_lookup_fs_info(struct fs_info *seek);
+struct fs_info *fs_info_from_block_dev(struct block_device *bdev);
+void free_fs_info(struct fs_info *fs_info);
+int bdev_matches_key(struct block_device *bdev, const char *key);
+struct block_device *next_bdev_of_type(struct block_device *last,
+	const char *key);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index c2b340e..6d9df3f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
 #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
 #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
 
+static inline unsigned long get_lock_parent_ip(void)
+{
+	unsigned long addr = CALLER_ADDR0;
+
+	if (!in_lock_functions(addr))
+		return addr;
+	addr = CALLER_ADDR1;
+	if (!in_lock_functions(addr))
+		return addr;
+	return CALLER_ADDR2;
+}
+
 #ifdef CONFIG_IRQSOFF_TRACER
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
   extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index af1f2b2..0b6edf7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -35,7 +35,8 @@ struct vm_area_struct;
 #define ___GFP_DIRECT_RECLAIM	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
 #define ___GFP_WRITE		0x1000000u
-#define ___GFP_KSWAPD_RECLAIM	0x2000000u
+#define ___GFP_TOI_NOTRACK      0x2000000u
+#define ___GFP_KSWAPD_RECLAIM	0x4000000u
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
@@ -153,6 +154,7 @@ struct vm_area_struct;
 #define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)
 #define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
 #define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
+#define __GFP_TOI_NOTRACK	((__force gfp_t)___GFP_TOI_NOTRACK)	/* Allocator wants page untracked by TOI */
 
 /*
  * Action modifiers
@@ -186,7 +188,7 @@ struct vm_area_struct;
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 26
+#define __GFP_BITS_SHIFT 27
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /*
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d4..056d322 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,8 +157,6 @@ extern struct task_group root_task_group;
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
@@ -187,6 +185,78 @@ extern struct task_group root_task_group;
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_BFS
+#define INIT_TASK_COMM "BFS"
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= NORMAL_PRIO,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= NORMAL_PRIO,					\
+	.deadline	= 0,						\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.restart_block = {						\
+		.fn = do_no_restart_syscall,				\
+	},								\
+	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.time_slice	= HZ,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
+	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	.thread_group	= LIST_HEAD_INIT(tsk.thread_group),		\
+	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),	\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_TASK_RCU_TASKS(tsk)					\
+	INIT_CPUSET_SEQ(tsk)						\
+	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
+	INIT_VTIME(tsk)							\
+	INIT_NUMA_BALANCING(tsk)					\
+	INIT_KASAN(tsk)							\
+}
+#else /* CONFIG_SCHED_BFS */
+#define INIT_TASK_COMM "swapper"
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
@@ -261,7 +331,7 @@ extern struct task_group root_task_group;
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
 }
-
+#endif /* CONFIG_SCHED_BFS */
 
 #define INIT_CPU_TIMERS(cpu_timers)					\
 {									\
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index beb9ce1..ce2fc3c 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -52,6 +52,8 @@ enum {
  */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
+	if (iso_task(task))
+		return 0;
 	return (task_nice(task) + 20) / 5;
 }
 
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 5fdc553..9384572 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -164,7 +164,11 @@ static inline u64 get_jiffies_64(void)
  * Have the 32 bit jiffies value wrap 5 minutes after boot
  * so jiffies wrap bugs show up earlier.
  */
+#ifdef CONFIG_SCHED_BFS
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
+#else
 #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#endif
 
 /*
  * Change timeval to jiffies, trying to avoid the
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 861f690..5276fe0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
+#include <linux/swait.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
 	int fpu_active;
 	int guest_fpu_loaded, guest_xcr0_loaded;
 	unsigned char fpu_counter;
-	wait_queue_head_t wq;
+	struct swait_queue_head wq;
 	struct pid *pid;
 	int sigset_active;
 	sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 #endif
 
-static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
 	return vcpu->arch.wqp;
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f..59ccab2 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 
 void clear_all_latency_tracing(struct task_struct *p);
 
+extern int sysctl_latencytop(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #else
 
 static inline void
diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h
index ca5bd91..5489dcb 100644
--- a/include/linux/linux_logo.h
+++ b/include/linux/linux_logo.h
@@ -37,6 +37,18 @@ extern const struct linux_logo logo_linux_vga16;
 extern const struct linux_logo logo_linux_clut224;
 extern const struct linux_logo logo_blackfin_vga16;
 extern const struct linux_logo logo_blackfin_clut224;
+extern const struct linux_logo logo_zen_clut224;
+extern const struct linux_logo logo_oldzen_clut224;
+extern const struct linux_logo logo_arch_clut224;
+extern const struct linux_logo logo_gentoo_clut224;
+extern const struct linux_logo logo_exherbo_clut224;
+extern const struct linux_logo logo_slackware_clut224;
+extern const struct linux_logo logo_debian_clut224;
+extern const struct linux_logo logo_fedorasimple_clut224;
+extern const struct linux_logo logo_fedoraglossy_clut224;
+extern const struct linux_logo logo_tits_clut224;
+extern const struct linux_logo logo_bsd_clut224;
+extern const struct linux_logo logo_fbsd_clut224;
 extern const struct linux_logo logo_dec_clut224;
 extern const struct linux_logo logo_mac_clut224;
 extern const struct linux_logo logo_parisc_clut224;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a6c240e..3ef2050 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1221,6 +1221,28 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+extern void vma_do_file_update_time(struct vm_area_struct *, const char[], int);
+extern struct file *vma_do_pr_or_file(struct vm_area_struct *, const char[],
+				      int);
+extern void vma_do_get_file(struct vm_area_struct *, const char[], int);
+extern void vma_do_fput(struct vm_area_struct *, const char[], int);
+
+#define vma_file_update_time(vma)	vma_do_file_update_time(vma, __func__, \
+								__LINE__)
+#define vma_pr_or_file(vma)		vma_do_pr_or_file(vma, __func__, \
+							  __LINE__)
+#define vma_get_file(vma)		vma_do_get_file(vma, __func__, __LINE__)
+#define vma_fput(vma)			vma_do_fput(vma, __func__, __LINE__)
+
+#ifndef CONFIG_MMU
+extern struct file *vmr_do_pr_or_file(struct vm_region *, const char[], int);
+extern void vmr_do_fput(struct vm_region *, const char[], int);
+
+#define vmr_pr_or_file(region)		vmr_do_pr_or_file(region, __func__, \
+							  __LINE__)
+#define vmr_fput(region)		vmr_do_fput(region, __func__, __LINE__)
+#endif /* !CONFIG_MMU */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
@@ -2229,6 +2251,7 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 #endif
 
+void drop_pagecache(void);
 void drop_slab(void);
 void drop_slab_node(int nid);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 624b78b..1be91c5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -269,6 +269,7 @@ struct vm_region {
 	unsigned long	vm_top;		/* region allocated to here */
 	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
 	struct file	*vm_file;	/* the backing file or NULL */
+	struct file	*vm_prfile;	/* the virtual backing file or NULL */
 
 	int		vm_usage;	/* region usage count (access under nommu_region_sem) */
 	bool		vm_icache_flushed : 1; /* true if the icache has been flushed for
@@ -343,6 +344,7 @@ struct vm_area_struct {
 	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
 					   units, *not* PAGE_CACHE_SIZE */
 	struct file * vm_file;		/* File we map to (can be NULL). */
+	struct file *vm_prfile;		/* shadow of vm_file */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 
 #ifndef CONFIG_MMU
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 19724e6..2e7ecb7 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,6 +101,12 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
+#ifdef CONFIG_TOI_INCREMENTAL
+	PG_toi_untracked,	/* Don't track dirtiness of this page - assume always dirty */
+	PG_toi_ro,		/* Page was made RO by TOI */
+	PG_toi_cbw,		/* Copy the page before it is written to */
+	PG_toi_dirty,		/* Page has been modified */
+#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
 	PG_young,
 	PG_idle,
@@ -343,6 +349,17 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
+#ifdef CONFIG_TOI_INCREMENTAL
+PAGEFLAG(TOI_RO, toi_ro)
+PAGEFLAG(TOI_Dirty, toi_dirty)
+PAGEFLAG(TOI_Untracked, toi_untracked)
+PAGEFLAG(TOI_CBW, toi_cbw)
+#else
+PAGEFLAG_FALSE(TOI_RO)
+PAGEFLAG_FALSE(TOI_Dirty)
+PAGEFLAG_FALSE(TOI_Untracked)
+PAGEFLAG_FALSE(TOI_CBW)
+#endif
 
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
 TESTPAGEFLAG(Young, young, PF_ANY)
@@ -665,8 +682,12 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
  * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
  * alloc-free cycle to prevent from reusing the page.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP	\
-	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+#ifdef CONFIG_TOI_INCREMENTAL
+#define PAGE_FLAGS_CHECK_AT_PREP	(((1 << NR_PAGEFLAGS) - 1) & \
+	~((1 << PG_toi_dirty) | (1 << PG_toi_ro) | __PG_HWPOISON))
+#else
+#define PAGE_FLAGS_CHECK_AT_PREP	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+#endif
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1 << PG_private | 1 << PG_private_2)
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d9cf5a5..7d5d0b8 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -19,8 +19,20 @@
  */
 
 #define MAX_USER_RT_PRIO	100
+
+#ifdef CONFIG_SCHED_BFS
+/* Note different MAX_RT_PRIO */
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 1)
+
+#define ISO_PRIO		(MAX_RT_PRIO)
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_RT_PRIO + 2)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_BFS */
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
+#endif /* CONFIG_SCHED_BFS */
+
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731..4f080ab 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
 
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..a1aec10 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,14 +176,12 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
 extern void calc_global_load(unsigned long ticks);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS)
 extern void update_cpu_load_nohz(int active);
 #else
 static inline void update_cpu_load_nohz(int active) { }
 #endif
 
-extern unsigned long get_parent_ip(unsigned long addr);
-
 extern void dump_cpu_task(int cpu);
 
 struct seq_file;
@@ -339,8 +337,6 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern cpumask_var_t cpu_isolated_map;
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
@@ -920,6 +916,10 @@ static inline int sched_info_on(void)
 #endif
 }
 
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
 enum cpu_idle_type {
 	CPU_IDLE,
 	CPU_NOT_IDLE,
@@ -1289,6 +1289,8 @@ struct sched_rt_entity {
 	unsigned long timeout;
 	unsigned long watchdog_stamp;
 	unsigned int time_slice;
+	unsigned short on_rq;
+	unsigned short on_list;
 
 	struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -1329,10 +1331,6 @@ struct sched_dl_entity {
 	 * task has to wait for a replenishment to be performed at the
 	 * next firing of dl_timer.
 	 *
-	 * @dl_new tells if a new instance arrived. If so we must
-	 * start executing it with full runtime and reset its absolute
-	 * deadline;
-	 *
 	 * @dl_boosted tells if we are boosted due to DI. If so we are
 	 * outside bandwidth enforcement mechanism (but only until we
 	 * exit the critical section);
@@ -1340,7 +1338,7 @@ struct sched_dl_entity {
 	 * @dl_yielded tells if task gave up the cpu before consuming
 	 * all its available runtime during the last job.
 	 */
-	int dl_throttled, dl_new, dl_boosted, dl_yielded;
+	int dl_throttled, dl_boosted, dl_yielded;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -1393,9 +1391,11 @@ struct task_struct {
 	unsigned int flags;	/* per process flags, defined below */
 	unsigned int ptrace;
 
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS)
+	int on_cpu;
+#endif
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
 	struct task_struct *last_wakee;
@@ -1403,12 +1403,29 @@ struct task_struct {
 	int wake_cpu;
 #endif
 	int on_rq;
-
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_BFS
+	int time_slice;
+	u64 deadline;
+	struct list_head run_list;
+	u64 last_ran;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMT_NICE
+	int smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+#ifdef CONFIG_SMP
+	bool sticky; /* Soft affined flag */
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+	bool zerobound; /* Bound to CPU0 for hotplug */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_BFS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
@@ -1528,6 +1545,9 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	cputime_t utime, stime, utimescaled, stimescaled;
+#ifdef CONFIG_SCHED_BFS
+	unsigned long utime_pc, stime_pc;
+#endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1846,6 +1866,63 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_SCHED_BFS
+bool grunqueue_is_locked(void);
+void grq_unlock_wait(void);
+void cpu_scaling(int cpu);
+void cpu_nonscaling(int cpu);
+#define tsk_seruntime(t)		((t)->sched_time)
+#define tsk_rttimeout(t)		((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+static inline int runqueue_is_locked(int cpu)
+{
+	return grunqueue_is_locked();
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+extern int runqueue_is_locked(int cpu);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO "CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return false;
+}
+
+/* Anyone feel like implementing this? */
+static inline bool above_background_load(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_BFS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -2261,7 +2338,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif
 
-#ifdef CONFIG_NO_HZ_COMMON
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_BFS)
 void calc_load_enter_idle(void);
 void calc_load_exit_idle(void);
 #else
@@ -2334,7 +2411,7 @@ extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index faa0e03..0d4ef69 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -153,7 +153,7 @@ unsigned int serial8250_modem_status(struct uart_8250_port *up);
 void serial8250_init_port(struct uart_8250_port *up);
 void serial8250_set_defaults(struct uart_8250_port *up);
 void serial8250_console_write(struct uart_8250_port *up, const char *s,
-			      unsigned int count);
+			      unsigned int count, unsigned int loglevel);
 int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
 
 extern void serial8250_set_isa_configurator(void (*v)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d4780c..665cd85 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -45,9 +45,10 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 extern int shmem_init(void);
 extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
 extern struct file *shmem_file_setup(const char *name,
-					loff_t size, unsigned long flags);
+					loff_t size, unsigned long flags,
+					int atomic_copy);
 extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
-					    unsigned long flags);
+					    unsigned long flags, int atomic_copy);
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 extern bool shmem_mapping(struct address_space *mapping);
diff --git a/include/linux/splice.h b/include/linux/splice.h
index da2751d..2e0fca6 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -83,4 +83,10 @@ extern void splice_shrink_spd(struct splice_pipe_desc *);
 extern void spd_release_page(struct splice_pipe_desc *, unsigned int);
 
 extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
+
+extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags);
+extern long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags);
 #endif
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 8b6ec7e..0e527a7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -491,6 +491,74 @@ extern bool pm_print_times_enabled;
 #define pm_print_times_enabled	(false)
 #endif
 
+enum {
+	TOI_CAN_HIBERNATE,
+	TOI_CAN_RESUME,
+	TOI_RESUME_DEVICE_OK,
+	TOI_NORESUME_SPECIFIED,
+	TOI_SANITY_CHECK_PROMPT,
+	TOI_CONTINUE_REQ,
+	TOI_RESUMED_BEFORE,
+	TOI_BOOT_TIME,
+	TOI_NOW_RESUMING,
+	TOI_IGNORE_LOGLEVEL,
+	TOI_TRYING_TO_RESUME,
+	TOI_LOADING_ALT_IMAGE,
+	TOI_STOP_RESUME,
+	TOI_IO_STOPPED,
+	TOI_NOTIFIERS_PREPARE,
+	TOI_CLUSTER_MODE,
+	TOI_BOOT_KERNEL,
+	TOI_DEVICE_HOTPLUG_LOCKED,
+};
+
+#ifdef CONFIG_TOI
+
+/* Used in init dir files */
+extern unsigned long toi_state;
+#define set_toi_state(bit) (set_bit(bit, &toi_state))
+#define clear_toi_state(bit) (clear_bit(bit, &toi_state))
+#define test_toi_state(bit) (test_bit(bit, &toi_state))
+extern int toi_running;
+
+#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action))
+extern int try_tuxonice_hibernate(void);
+
+#else /* !CONFIG_TOI */
+
+#define toi_state		(0)
+#define set_toi_state(bit) do { } while (0)
+#define clear_toi_state(bit) do { } while (0)
+#define test_toi_state(bit) (0)
+#define toi_running (0)
+
+static inline int try_tuxonice_hibernate(void) { return 0; }
+#define test_action_state(bit) (0)
+
+#endif /* CONFIG_TOI */
+
+#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_TOI
+extern void try_tuxonice_resume(void);
+#else
+#define try_tuxonice_resume() do { } while (0)
+#endif
+
+extern int resume_attempted;
+extern int software_resume(void);
+
+static inline void check_resume_attempted(void)
+{
+	if (resume_attempted)
+		return;
+
+	software_resume();
+}
+#else
+#define check_resume_attempted() do { } while (0)
+#define resume_attempted (0)
+#endif
+
 #ifdef CONFIG_PM_AUTOSLEEP
 
 /* kernel/power/autosleep.c */
diff --git b/include/linux/swait.h b/include/linux/swait.h
new file mode 100644
index 0000000..c1f9c62
--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ *    sleeper state.
+ *
+ *  - the exclusive mode; because this requires preserving the list order
+ *    and this is hard.
+ *
+ *  - custom wake functions; because you cannot give any guarantees about
+ *    random code.
+ *
+ * As a side effect of this; the data structures are slimmer.
+ *
+ * One would recommend using this wait queue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+	raw_spinlock_t		lock;
+	struct list_head	task_list;
+};
+
+struct swait_queue {
+	struct task_struct	*task;
+	struct list_head	task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) {				\
+	.task		= current,					\
+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
+}
+
+#define DECLARE_SWAITQUEUE(name)					\
+	struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {				\
+	.lock		= __RAW_SPIN_LOCK_UNLOCKED(name.lock),		\
+	.task_list	= LIST_HEAD_INIT((name).task_list),		\
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name)					\
+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+				    struct lock_class_key *key);
+
+#define init_swait_queue_head(q)				\
+	do {							\
+		static struct lock_class_key __key;		\
+		__init_swait_queue_head((q), #q, &__key);	\
+	} while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)			\
+	({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
+	struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)			\
+	DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+	return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd)			\
+({									\
+	struct swait_queue __wait;					\
+	long __ret = ret;						\
+									\
+	INIT_LIST_HEAD(&__wait.task_list);				\
+	for (;;) {							\
+		long __int = prepare_to_swait_event(&wq, &__wait, state);\
+									\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			break;						\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_swait(&wq, &__wait);					\
+	__ret;								\
+})
+
+#define __swait_event(wq, condition)					\
+	(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,	\
+			    schedule())
+
+#define swait_event(wq, condition)					\
+do {									\
+	if (condition)							\
+		break;							\
+	__swait_event(wq, condition);					\
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout)			\
+	___swait_event(wq, ___wait_cond_timeout(condition),		\
+		      TASK_UNINTERRUPTIBLE, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout)			\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __swait_event_timeout(wq, condition, timeout);	\
+	__ret;								\
+})
+
+#define __swait_event_interruptible(wq, condition)			\
+	___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,		\
+		      schedule())
+
+#define swait_event_interruptible(wq, condition)			\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__ret = __swait_event_interruptible(wq, condition);	\
+	__ret;								\
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout)	\
+	___swait_event(wq, ___wait_cond_timeout(condition),		\
+		      TASK_INTERRUPTIBLE, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __swait_event_interruptible_timeout(wq,		\
+						condition, timeout);	\
+	__ret;								\
+})
+
+#endif /* _LINUX_SWAIT_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5fa4aa4..67f6507 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -288,6 +288,7 @@ static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
 extern unsigned long nr_free_buffer_pages(void);
+extern unsigned long nr_unallocated_buffer_pages(void);
 extern unsigned long nr_free_pagecache_pages(void);
 
 /* Definition of global_page_state not available yet */
@@ -328,6 +329,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						struct zone *zone,
 						unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
+extern unsigned long shrink_memory_mask(unsigned long nr_to_reclaim,
+		gfp_t mask);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
@@ -413,14 +416,18 @@ extern void swapcache_free(swp_entry_t);
 extern int free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
+extern sector_t map_swap_entry(swp_entry_t entry, struct block_device **);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
+extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int page_swapcount(struct page *);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
+extern void get_swap_range_of_type(int type, swp_entry_t *start,
+		swp_entry_t *end, unsigned int limit);
 
 #else /* CONFIG_SWAP */
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361..4e92c0f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -19,6 +19,7 @@
 
 
 #include <linux/skbuff.h>
+#include <linux/cryptohash.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -336,6 +337,21 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+#ifdef CONFIG_TCP_STEALTH
+/* Stealth TCP socket configuration */
+	struct {
+		#define TCP_STEALTH_MODE_AUTH		BIT(0)
+		#define TCP_STEALTH_MODE_INTEGRITY	BIT(1)
+		#define TCP_STEALTH_MODE_INTEGRITY_LEN	BIT(2)
+		u8 mode;
+		u8 secret[MD5_MESSAGE_BYTES];
+		u16 integrity_hash;
+		size_t integrity_len;
+		struct skb_mstamp mstamp;
+		bool saw_tsval;
+	} stealth;
+#endif
+
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
 	/* fastopen_rsk points to request_sock that resulted in this big
diff --git b/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h
new file mode 100644
index 0000000..1b80d7e
--- /dev/null
+++ b/include/linux/thinkpad_ec.h
@@ -0,0 +1,47 @@
+/*
+ *  thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions
+ *
+ *  Copyright (C) 2005 Shem Multinymous <multinymous@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _THINKPAD_EC_H
+#define _THINKPAD_EC_H
+
+#ifdef __KERNEL__
+
+#define TP_CONTROLLER_ROW_LEN 16
+
+/* EC transaction input and output are (possibly partial) vectors of 16 bytes. */
+struct thinkpad_ec_row {
+	u16 mask; /* bitmap of which entries of val[] are meaningful */
+	u8 val[TP_CONTROLLER_ROW_LEN];
+};
+
+extern int __must_check thinkpad_ec_lock(void);
+extern int __must_check thinkpad_ec_try_lock(void);
+extern void thinkpad_ec_unlock(void);
+
+extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args,
+				struct thinkpad_ec_row *data);
+extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args,
+				    struct thinkpad_ec_row *mask);
+extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args);
+extern void thinkpad_ec_invalidate(void);
+
+
+#endif /* __KERNEL__ */
+#endif /* _THINKPAD_EC_H */
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index b4c2a48..c54e16a 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -56,10 +56,10 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 #ifdef __KERNEL__
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | ___GFP_TOI_NOTRACK | \
 				 __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | ___GFP_TOI_NOTRACK)
 #endif
 
 /*
diff --git b/include/linux/tuxonice.h b/include/linux/tuxonice.h
new file mode 100644
index 0000000..67b05a7
--- /dev/null
+++ b/include/linux/tuxonice.h
@@ -0,0 +1,48 @@
+/*
+ * include/linux/tuxonice.h
+ *
+ * Copyright (C) 2015 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef INCLUDE_LINUX_TUXONICE_H
+#define INCLUDE_LINUX_TUXONICE_H
+#ifdef CONFIG_TOI_INCREMENTAL
+extern void toi_set_logbuf_untracked(void);
+extern int toi_make_writable(pgd_t *pgd, unsigned long address);
+
+static inline int toi_incremental_support(void)
+{
+	return 1;
+}
+
+/* Copy Before Write */
+struct toi_cbw {
+	unsigned long pfn;
+	void *virt;
+	struct toi_cbw *next;
+};
+
+struct toi_cbw_state {
+	bool active;		/* Is a fault handler running? */
+	bool enabled;		/* Are we doing copy before write? */
+	int size;		/* The number of pages allocated */
+	struct toi_cbw *first, *next, *last;	/* Pointers to the data structure */
+};
+
+#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
+DECLARE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
+#else
+#define toi_set_logbuf_untracked() do { } while (0)
+static inline int toi_make_writable(pgd_t *pgd, unsigned long addr)
+{
+	return 0;
+}
+
+static inline int toi_incremental_support(void)
+{
+	return 0;
+}
+#endif
+#endif
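
To make the copy-before-write bookkeeping above concrete, here is a hedged sketch of a consumer that counts the CBW entries recorded for the current CPU. It assumes that 'first' heads a NULL-terminated singly linked list chained through '->next' and is only meaningful under CONFIG_TOI_INCREMENTAL; only the types and the toi_cbw_states per-CPU variable come from the header.

#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/tuxonice.h>

#ifdef CONFIG_TOI_INCREMENTAL
static int toi_cbw_count_this_cpu(void)
{
	struct toi_cbw_state *state = this_cpu_ptr(&toi_cbw_states);
	struct toi_cbw *cbw;
	int count = 0;

	if (!state->enabled)
		return 0;

	/* walk the assumed first -> next -> ... chain */
	for (cbw = state->first; cbw; cbw = cbw->next)
		count++;

	return count;
}
#endif
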
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ae71a76..27d7a0a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -338,7 +338,7 @@ do {									\
 			    schedule(); try_to_freeze())
 
 /**
- * wait_event - sleep (or freeze) until a condition gets true
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  *
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45..ec392a1 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -14,5 +14,10 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport);
+#ifdef CONFIG_TCP_STEALTH
+u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb);
+u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr,
+				u32 daddr_size, __be16 dport);
+#endif
 
 #endif /* _NET_SECURE_SEQ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ae6468f..7b9c0ff 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -440,6 +440,12 @@ void tcp_parse_options(const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
+#ifdef CONFIG_TCP_STEALTH
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th);
+int tcp_stealth_integrity(u16 *hash, u8 *secret, u8 *payload, int len);
+#define be32_isn_to_be16_av(x)	(((__be16 *)&x)[0])
+#define be32_isn_to_be16_ih(x)	(((__be16 *)&x)[1])
+#endif
 
 /*
  *	TCP v4 functions exported for the inet6 API
diff --git b/include/trace/events/fs.h b/include/trace/events/fs.h
new file mode 100644
index 0000000..fb634b7
--- /dev/null
+++ b/include/trace/events/fs.h
@@ -0,0 +1,53 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs
+
+#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_H
+
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(do_sys_open,
+
+	TP_PROTO(const char *filename, int flags, int mode),
+
+	TP_ARGS(filename, flags, mode),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+		__field(	int, flags			)
+		__field(	int, mode			)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+		__entry->flags = flags;
+		__entry->mode = mode;
+	),
+
+	TP_printk("\"%s\" %x %o",
+		  __get_str(filename), __entry->flags, __entry->mode)
+);
+
+TRACE_EVENT(open_exec,
+
+	TP_PROTO(const char *filename),
+
+	TP_ARGS(filename),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+	),
+
+	TP_printk("\"%s\"",
+		  __get_str(filename))
+);
+
+#endif /* _TRACE_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
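
As with any TRACE_EVENT() definition, this header generates trace_do_sys_open() and trace_open_exec() hooks once a single compilation unit includes it with CREATE_TRACE_POINTS defined; call sites then simply invoke the generated functions. A sketch of the expected usage follows; the surrounding function is illustrative, not the actual fs/open.c code.

#define CREATE_TRACE_POINTS		/* in exactly one .c file */
#include <trace/events/fs.h>

static long example_do_sys_open(const char *filename, int flags, int mode)
{
	/* emits an "fs:do_sys_open" event when the tracepoint is enabled */
	trace_do_sys_open(filename, flags, mode);

	/* ... the real open logic would follow here ... */
	return 0;
}
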
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e6..32152e7 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -59,6 +59,7 @@ header-y += atmsvc.h
 header-y += atm_tcp.h
 header-y += atm_zatm.h
 header-y += audit.h
+header-y += aufs_type.h
 header-y += auto_fs4.h
 header-y += auto_fs.h
 header-y += auxvec.h
diff --git b/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h
new file mode 100644
index 0000000..e783d44
--- /dev/null
+++ b/include/uapi/linux/aufs_type.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+#ifndef __AUFS_TYPE_H__
+#define __AUFS_TYPE_H__
+
+#define AUFS_NAME	"aufs"
+
+#ifdef __KERNEL__
+/*
+ * define it before including all other headers.
+ * sched.h may use pr_* macros before defining "current", so define the
+ * no-current version first, and re-define later.
+ */
+#define pr_fmt(fmt)	AUFS_NAME " %s:%d: " fmt, __func__, __LINE__
+#include <linux/sched.h>
+#undef pr_fmt
+#define pr_fmt(fmt) \
+		AUFS_NAME " %s:%d:%.*s[%d]: " fmt, __func__, __LINE__, \
+		(int)sizeof(current->comm), current->comm, current->pid
+#else
+#include <stdint.h>
+#include <sys/types.h>
+#endif /* __KERNEL__ */
+
+#include <linux/limits.h>
+
+#define AUFS_VERSION	"4.5"
+
+/* todo? move this to linux-2.6.19/include/magic.h */
+#define AUFS_SUPER_MAGIC	('a' << 24 | 'u' << 16 | 'f' << 8 | 's')
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_BRANCH_MAX_127
+typedef int8_t aufs_bindex_t;
+#define AUFS_BRANCH_MAX 127
+#else
+typedef int16_t aufs_bindex_t;
+#ifdef CONFIG_AUFS_BRANCH_MAX_511
+#define AUFS_BRANCH_MAX 511
+#elif defined(CONFIG_AUFS_BRANCH_MAX_1023)
+#define AUFS_BRANCH_MAX 1023
+#elif defined(CONFIG_AUFS_BRANCH_MAX_32767)
+#define AUFS_BRANCH_MAX 32767
+#endif
+#endif
+
+#ifdef __KERNEL__
+#ifndef AUFS_BRANCH_MAX
+#error unknown CONFIG_AUFS_BRANCH_MAX value
+#endif
+#endif /* __KERNEL__ */
+
+/* ---------------------------------------------------------------------- */
+
+#define AUFS_FSTYPE		AUFS_NAME
+
+#define AUFS_ROOT_INO		2
+#define AUFS_FIRST_INO		11
+
+#define AUFS_WH_PFX		".wh."
+#define AUFS_WH_PFX_LEN		((int)sizeof(AUFS_WH_PFX) - 1)
+#define AUFS_WH_TMP_LEN		4
+/* a limit for rmdir/rename a dir and copyup */
+#define AUFS_MAX_NAMELEN	(NAME_MAX \
+				- AUFS_WH_PFX_LEN * 2	/* doubly whiteouted */\
+				- 1			/* dot */\
+				- AUFS_WH_TMP_LEN)	/* hex */
+#define AUFS_XINO_FNAME		"." AUFS_NAME ".xino"
+#define AUFS_XINO_DEFPATH	"/tmp/" AUFS_XINO_FNAME
+#define AUFS_XINO_DEF_SEC	30 /* seconds */
+#define AUFS_XINO_DEF_TRUNC	45 /* percentage */
+#define AUFS_DIRWH_DEF		3
+#define AUFS_RDCACHE_DEF	10 /* seconds */
+#define AUFS_RDCACHE_MAX	3600 /* seconds */
+#define AUFS_RDBLK_DEF		512 /* bytes */
+#define AUFS_RDHASH_DEF		32
+#define AUFS_WKQ_NAME		AUFS_NAME "d"
+#define AUFS_MFS_DEF_SEC	30 /* seconds */
+#define AUFS_MFS_MAX_SEC	3600 /* seconds */
+#define AUFS_FHSM_CACHE_DEF_SEC	30 /* seconds */
+#define AUFS_PLINK_WARN		50 /* number of plinks in a single bucket */
+
+/* pseudo-link maintenance under /proc */
+#define AUFS_PLINK_MAINT_NAME	"plink_maint"
+#define AUFS_PLINK_MAINT_DIR	"fs/" AUFS_NAME
+#define AUFS_PLINK_MAINT_PATH	AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME
+
+#define AUFS_DIROPQ_NAME	AUFS_WH_PFX ".opq" /* whiteouted doubly */
+#define AUFS_WH_DIROPQ		AUFS_WH_PFX AUFS_DIROPQ_NAME
+
+#define AUFS_BASE_NAME		AUFS_WH_PFX AUFS_NAME
+#define AUFS_PLINKDIR_NAME	AUFS_WH_PFX "plnk"
+#define AUFS_ORPHDIR_NAME	AUFS_WH_PFX "orph"
+
+/* doubly whiteouted */
+#define AUFS_WH_BASE		AUFS_WH_PFX AUFS_BASE_NAME
+#define AUFS_WH_PLINKDIR	AUFS_WH_PFX AUFS_PLINKDIR_NAME
+#define AUFS_WH_ORPHDIR		AUFS_WH_PFX AUFS_ORPHDIR_NAME
+
+/* branch permissions and attributes */
+#define AUFS_BRPERM_RW		"rw"
+#define AUFS_BRPERM_RO		"ro"
+#define AUFS_BRPERM_RR		"rr"
+#define AUFS_BRATTR_COO_REG	"coo_reg"
+#define AUFS_BRATTR_COO_ALL	"coo_all"
+#define AUFS_BRATTR_FHSM	"fhsm"
+#define AUFS_BRATTR_UNPIN	"unpin"
+#define AUFS_BRATTR_ICEX	"icex"
+#define AUFS_BRATTR_ICEX_SEC	"icexsec"
+#define AUFS_BRATTR_ICEX_SYS	"icexsys"
+#define AUFS_BRATTR_ICEX_TR	"icextr"
+#define AUFS_BRATTR_ICEX_USR	"icexusr"
+#define AUFS_BRATTR_ICEX_OTH	"icexoth"
+#define AUFS_BRRATTR_WH		"wh"
+#define AUFS_BRWATTR_NLWH	"nolwh"
+#define AUFS_BRWATTR_MOO	"moo"
+
+#define AuBrPerm_RW		1		/* writable, hardlinkable wh */
+#define AuBrPerm_RO		(1 << 1)	/* readonly */
+#define AuBrPerm_RR		(1 << 2)	/* natively readonly */
+#define AuBrPerm_Mask		(AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR)
+
+#define AuBrAttr_COO_REG	(1 << 3)	/* copy-up on open */
+#define AuBrAttr_COO_ALL	(1 << 4)
+#define AuBrAttr_COO_Mask	(AuBrAttr_COO_REG | AuBrAttr_COO_ALL)
+
+#define AuBrAttr_FHSM		(1 << 5)	/* file-based hsm */
+#define AuBrAttr_UNPIN		(1 << 6)	/* rename-able top dir of
+						   branch. meaningless since
+						   linux-3.18-rc1 */
+
+/* ignore error in copying XATTR */
+#define AuBrAttr_ICEX_SEC	(1 << 7)
+#define AuBrAttr_ICEX_SYS	(1 << 8)
+#define AuBrAttr_ICEX_TR	(1 << 9)
+#define AuBrAttr_ICEX_USR	(1 << 10)
+#define AuBrAttr_ICEX_OTH	(1 << 11)
+#define AuBrAttr_ICEX		(AuBrAttr_ICEX_SEC	\
+				 | AuBrAttr_ICEX_SYS	\
+				 | AuBrAttr_ICEX_TR	\
+				 | AuBrAttr_ICEX_USR	\
+				 | AuBrAttr_ICEX_OTH)
+
+#define AuBrRAttr_WH		(1 << 12)	/* whiteout-able */
+#define AuBrRAttr_Mask		AuBrRAttr_WH
+
+#define AuBrWAttr_NoLinkWH	(1 << 13)	/* un-hardlinkable whiteouts */
+#define AuBrWAttr_MOO		(1 << 14)	/* move-up on open */
+#define AuBrWAttr_Mask		(AuBrWAttr_NoLinkWH | AuBrWAttr_MOO)
+
+#define AuBrAttr_CMOO_Mask	(AuBrAttr_COO_Mask | AuBrWAttr_MOO)
+
+/* #warning test userspace */
+#ifdef __KERNEL__
+#ifndef CONFIG_AUFS_FHSM
+#undef AuBrAttr_FHSM
+#define AuBrAttr_FHSM		0
+#endif
+#ifndef CONFIG_AUFS_XATTR
+#undef	AuBrAttr_ICEX
+#define AuBrAttr_ICEX		0
+#undef	AuBrAttr_ICEX_SEC
+#define AuBrAttr_ICEX_SEC	0
+#undef	AuBrAttr_ICEX_SYS
+#define AuBrAttr_ICEX_SYS	0
+#undef	AuBrAttr_ICEX_TR
+#define AuBrAttr_ICEX_TR	0
+#undef	AuBrAttr_ICEX_USR
+#define AuBrAttr_ICEX_USR	0
+#undef	AuBrAttr_ICEX_OTH
+#define AuBrAttr_ICEX_OTH	0
+#endif
+#endif
+
+/* the longest combination */
+/* AUFS_BRATTR_ICEX and AUFS_BRATTR_ICEX_TR don't affect here */
+#define AuBrPermStrSz	sizeof(AUFS_BRPERM_RW			\
+			       "+" AUFS_BRATTR_COO_REG		\
+			       "+" AUFS_BRATTR_FHSM		\
+			       "+" AUFS_BRATTR_UNPIN		\
+			       "+" AUFS_BRATTR_ICEX_SEC		\
+			       "+" AUFS_BRATTR_ICEX_SYS		\
+			       "+" AUFS_BRATTR_ICEX_USR		\
+			       "+" AUFS_BRATTR_ICEX_OTH		\
+			       "+" AUFS_BRWATTR_NLWH)
+
+typedef struct {
+	char a[AuBrPermStrSz];
+} au_br_perm_str_t;
+
+static inline int au_br_writable(int brperm)
+{
+	return brperm & AuBrPerm_RW;
+}
+
+static inline int au_br_whable(int brperm)
+{
+	return brperm & (AuBrPerm_RW | AuBrRAttr_WH);
+}
+
+static inline int au_br_wh_linkable(int brperm)
+{
+	return !(brperm & AuBrWAttr_NoLinkWH);
+}
+
+static inline int au_br_cmoo(int brperm)
+{
+	return brperm & AuBrAttr_CMOO_Mask;
+}
+
+static inline int au_br_fhsm(int brperm)
+{
+	return brperm & AuBrAttr_FHSM;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ioctl */
+enum {
+	/* readdir in userspace */
+	AuCtl_RDU,
+	AuCtl_RDU_INO,
+
+	AuCtl_WBR_FD,	/* pathconf wrapper */
+	AuCtl_IBUSY,	/* busy inode */
+	AuCtl_MVDOWN,	/* move-down */
+	AuCtl_BR,	/* info about branches */
+	AuCtl_FHSM_FD	/* connection for fhsm */
+};
+
+/* borrowed from linux/include/linux/kernel.h */
+#ifndef ALIGN
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
+#endif
+
+/* borrowed from linux/include/linux/compiler-gcc3.h */
+#ifndef __aligned
+#define __aligned(x)			__attribute__((aligned(x)))
+#endif
+
+#ifdef __KERNEL__
+#ifndef __packed
+#define __packed			__attribute__((packed))
+#endif
+#endif
+
+struct au_rdu_cookie {
+	uint64_t	h_pos;
+	int16_t		bindex;
+	uint8_t		flags;
+	uint8_t		pad;
+	uint32_t	generation;
+} __aligned(8);
+
+struct au_rdu_ent {
+	uint64_t	ino;
+	int16_t		bindex;
+	uint8_t		type;
+	uint8_t		nlen;
+	uint8_t		wh;
+	char		name[0];
+} __aligned(8);
+
+static inline int au_rdu_len(int nlen)
+{
+	/* include the terminating NULL */
+	return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1,
+		     sizeof(uint64_t));
+}
+
+union au_rdu_ent_ul {
+	struct au_rdu_ent __user	*e;
+	uint64_t			ul;
+};
+
+enum {
+	AufsCtlRduV_SZ,
+	AufsCtlRduV_End
+};
+
+struct aufs_rdu {
+	/* input */
+	union {
+		uint64_t	sz;	/* AuCtl_RDU */
+		uint64_t	nent;	/* AuCtl_RDU_INO */
+	};
+	union au_rdu_ent_ul	ent;
+	uint16_t		verify[AufsCtlRduV_End];
+
+	/* input/output */
+	uint32_t		blk;
+
+	/* output */
+	union au_rdu_ent_ul	tail;
+	/* number of entries which were added in a single call */
+	uint64_t		rent;
+	uint8_t			full;
+	uint8_t			shwh;
+
+	struct au_rdu_cookie	cookie;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+struct aufs_wbr_fd {
+	uint32_t	oflags;
+	int16_t		brid;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+struct aufs_ibusy {
+	uint64_t	ino, h_ino;
+	int16_t		bindex;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+/* error code for move-down */
+/* the actual message strings are implemented in aufs-util.git */
+enum {
+	EAU_MVDOWN_OPAQUE = 1,
+	EAU_MVDOWN_WHITEOUT,
+	EAU_MVDOWN_UPPER,
+	EAU_MVDOWN_BOTTOM,
+	EAU_MVDOWN_NOUPPER,
+	EAU_MVDOWN_NOLOWERBR,
+	EAU_Last
+};
+
+/* flags for move-down */
+#define AUFS_MVDOWN_DMSG	1
+#define AUFS_MVDOWN_OWLOWER	(1 << 1)	/* overwrite lower */
+#define AUFS_MVDOWN_KUPPER	(1 << 2)	/* keep upper */
+#define AUFS_MVDOWN_ROLOWER	(1 << 3)	/* do even if lower is RO */
+#define AUFS_MVDOWN_ROLOWER_R	(1 << 4)	/* did on lower RO */
+#define AUFS_MVDOWN_ROUPPER	(1 << 5)	/* do even if upper is RO */
+#define AUFS_MVDOWN_ROUPPER_R	(1 << 6)	/* did on upper RO */
+#define AUFS_MVDOWN_BRID_UPPER	(1 << 7)	/* upper brid */
+#define AUFS_MVDOWN_BRID_LOWER	(1 << 8)	/* lower brid */
+#define AUFS_MVDOWN_FHSM_LOWER	(1 << 9)	/* find fhsm attr for lower */
+#define AUFS_MVDOWN_STFS	(1 << 10)	/* req. stfs */
+#define AUFS_MVDOWN_STFS_FAILED	(1 << 11)	/* output: stfs is unusable */
+#define AUFS_MVDOWN_BOTTOM	(1 << 12)	/* output: no more lowers */
+
+/* index for move-down */
+enum {
+	AUFS_MVDOWN_UPPER,
+	AUFS_MVDOWN_LOWER,
+	AUFS_MVDOWN_NARRAY
+};
+
+/*
+ * additional info of move-down
+ * number of free blocks and inodes.
+ * subset of struct kstatfs, but smaller and always 64bit.
+ */
+struct aufs_stfs {
+	uint64_t	f_blocks;
+	uint64_t	f_bavail;
+	uint64_t	f_files;
+	uint64_t	f_ffree;
+};
+
+struct aufs_stbr {
+	int16_t			brid;	/* optional input */
+	int16_t			bindex;	/* output */
+	struct aufs_stfs	stfs;	/* output when AUFS_MVDOWN_STFS set */
+} __aligned(8);
+
+struct aufs_mvdown {
+	uint32_t		flags;			/* input/output */
+	struct aufs_stbr	stbr[AUFS_MVDOWN_NARRAY]; /* input/output */
+	int8_t			au_errno;		/* output */
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+union aufs_brinfo {
+	/* PATH_MAX may differ between kernel-space and user-space */
+	char	_spacer[4096];
+	struct {
+		int16_t	id;
+		int	perm;
+		char	path[0];
+	};
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+#define AuCtlType		'A'
+#define AUFS_CTL_RDU		_IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu)
+#define AUFS_CTL_RDU_INO	_IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu)
+#define AUFS_CTL_WBR_FD		_IOW(AuCtlType, AuCtl_WBR_FD, \
+				     struct aufs_wbr_fd)
+#define AUFS_CTL_IBUSY		_IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy)
+#define AUFS_CTL_MVDOWN		_IOWR(AuCtlType, AuCtl_MVDOWN, \
+				      struct aufs_mvdown)
+#define AUFS_CTL_BRINFO		_IOW(AuCtlType, AuCtl_BR, union aufs_brinfo)
+#define AUFS_CTL_FHSM_FD	_IOW(AuCtlType, AuCtl_FHSM_FD, int)
+
+#endif /* __AUFS_TYPE_H__ */
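
The readdir-in-userspace structures above are variable-length records laid out at au_rdu_len() strides. A userspace consumer that has obtained a filled buffer (the patch intends this to happen via the AUFS_CTL_RDU ioctl, whose exact calling convention lives in aufs-util, so the buffer here is simply assumed to be populated) could walk it as in this hedged sketch:

#include <stdio.h>
#include <linux/aufs_type.h>

/* walk 'count' entries packed back to back starting at 'buf' */
static void dump_rdu_entries(void *buf, unsigned long count)
{
	struct au_rdu_ent *ent = buf;

	while (count--) {
		printf("ino %llu bindex %d type %u name %.*s%s\n",
		       (unsigned long long)ent->ino, (int)ent->bindex,
		       (unsigned)ent->type, (int)ent->nlen, ent->name,
		       ent->wh ? " (whiteout)" : "");
		ent = (struct au_rdu_ent *)((char *)ent + au_rdu_len(ent->nlen));
	}
}
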
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f095155..f95de72 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -27,6 +27,8 @@
 #define NETLINK_ECRYPTFS	19
 #define NETLINK_RDMA		20
 #define NETLINK_CRYPTO		21	/* Crypto layer */
+#define NETLINK_TOI_USERUI	22	/* TuxOnIce's userui */
+#define NETLINK_TOI_USM		23	/* Userspace storage manager */
 
 #define NETLINK_INET_DIAG	NETLINK_SOCK_DIAG
 
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index cc89dde..f63e1cd 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,9 +37,16 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on BFS only */
 #define SCHED_IDLE		5
+#ifdef CONFIG_SCHED_BFS
+#define SCHED_ISO		4
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#else /* CONFIG_SCHED_BFS */
 #define SCHED_DEADLINE		6
+#endif /* CONFIG_SCHED_BFS */
 
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
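
With CONFIG_SCHED_BFS enabled, SCHED_ISO (value 4) becomes a real policy rather than a reserved number. A hypothetical userspace snippet that opts the calling task into it (roughly what the schedtool invocation mentioned in the BFS Kconfig help further below does) might look like this:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* value defined above under CONFIG_SCHED_BFS */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	/* SCHED_ISO is not a realtime class, so the priority must be 0 */
	if (sched_setscheduler(0, SCHED_ISO, &param)) {
		perror("sched_setscheduler(SCHED_ISO)");
		return 1;
	}
	puts("now running under SCHED_ISO");
	return 0;
}
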
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 65a77b0..9a0dfc2 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -115,6 +115,9 @@ enum {
 #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
+#define TCP_STEALTH		29
+#define TCP_STEALTH_INTEGRITY	30
+#define TCP_STEALTH_INTEGRITY_LEN	31
 
 struct tcp_repair_opt {
 	__u32	opt_code;
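
The three option names above come from the TCP Stealth ("Knock") patch set. The sketch below shows the rough call shape (a pre-shared secret installed with setsockopt() before connect() or listen()); the 64-byte size mirrors the MD5_MESSAGE_BYTES buffer added to struct tcp_sock earlier in this patch, and the details should be checked against the Knock documentation rather than taken as authoritative.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

#ifndef TCP_STEALTH
#define TCP_STEALTH 29		/* matches the definition above */
#endif

/* illustrative only: install a 64-byte pre-shared secret on a TCP socket */
static int enable_tcp_stealth(int sock, const char *passphrase)
{
	char secret[64] = { 0 };

	strncpy(secret, passphrase, sizeof(secret) - 1);
	return setsockopt(sock, IPPROTO_TCP, TCP_STEALTH,
			  secret, sizeof(secret));
}
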
diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index 978578b..6d2e54e 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -3,12 +3,26 @@
 
 
 /*
+ * We make this definition solely so that packages such as splashutils
+ * can build, because they cannot know that NR_TTY_DEVICES is defined
+ * in the kernel configuration.
+ */
+#ifndef CONFIG_NR_TTY_DEVICES
+#define CONFIG_NR_TTY_DEVICES 63
+#endif
+
+/*
  * These constants are also useful for user-level apps (e.g., VC
  * resizing).
  */
 #define MIN_NR_CONSOLES 1       /* must be at least 1 */
-#define MAX_NR_CONSOLES	63	/* serial lines start at 64 */
-#define MAX_NR_USER_CONSOLES 63	/* must be root to allocate above this */
+
+/*
+ * NR_TTY_DEVICES:
+ * Value MUST be at least 11 and must never be higher than 63
+ */
+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES		/* serial lines start above this */
+#define MAX_NR_USER_CONSOLES CONFIG_NR_TTY_DEVICES	/* must be root to allocate above this */
 		/* Note: the ioctl VT_GETSTATE does not work for
 		   consoles 16 and higher (since it returns a short) */
 
diff --git a/init/Kconfig b/init/Kconfig
index 2232080..0d6bb5c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -28,6 +28,62 @@ config BUILDTIME_EXTABLE_SORT
 
 menu "General setup"
 
+config PCK_INTERACTIVE
+	bool "Tune kernel for interactivity"
+	default y
+	help
+	  Tunes the kernel for responsiveness at the cost of throughput and power usage.
+
+	  --- Virtual Memory Subsystem ---------------------------
+
+	    Mem dirty before bg writeback..:  10 %  ->  20 %
+	    Mem dirty before sync writeback:  20 %  ->  50 %
+
+	  --- Block Layer ----------------------------------------
+
+	    NCQ Queue Depth................:  31    ->   8
+	    Block Layer Queue Depth........: 128    ->  16
+
+	  --- CPU Scheduler (CFS) --------------------------------
+
+	    Scheduling latency.............:   6    ->   3    ms
+	    Minimal granularity............:   0.75 ->   0.3  ms
+	    Wakeup granularity.............:   1    ->   0.5  ms
+	    CPU migration cost.............:   0.5  ->   0.25 ms
+	    Bandwidth slice size...........:   5    ->   3    ms
+
+	  --- CPU Scheduler (BFS) --------------------------------
+
+	    Scheduling interval............:   6    ->   3    ms
+	    ISO task max realtime use......:  70 %  ->  25 %
+
+	  --- CPU Frequency Scaling ------------------------------
+
+	    Ondemand down scaling factor...:   1    ->  10
+
+config SCHED_BFS
+	bool "BFS cpu scheduler"
+	default n
+	help
+	  The Brain Fuck CPU Scheduler for excellent interactivity and
+	  responsiveness on the desktop and solid scalability on normal
+	  hardware and commodity servers. Not recommended for 4096 CPUs.
+
+	  Currently incompatible with the Group CPU scheduler and RCU torture
+	  testing, so those options are disabled.
+
+config SCHED_BFS_AUTOISO
+	bool "Automatically use SCHED_ISO policy for X"
+	depends on SCHED_BFS
+	default n
+	help
+	  Selecting this option will automatically use the SCHED_ISO scheduling
+	  policy for X, resulting in an interactivity boost. This *may* cause
+	  audio applications that are not run as SCHED_ISO to skip or stutter.
+
+	  Tasks (including X) can also be switched to SCHED_ISO manually using
+	  schedtool.
+
 config BROKEN
 	bool
 
@@ -335,7 +391,7 @@ choice
 # Kind of a stub config for the pure tick based cputime accounting
 config TICK_CPU_ACCOUNTING
 	bool "Simple tick based cputime accounting"
-	depends on !S390 && !NO_HZ_FULL
+	depends on !S390 && !NO_HZ_FULL && !SCHED_BFS
 	help
 	  This is the basic tick based cputime accounting that maintains
 	  statistics about user, system and idle time spent on per jiffies
@@ -360,6 +416,7 @@ config VIRT_CPU_ACCOUNTING_GEN
 	bool "Full dynticks CPU time accounting"
 	depends on HAVE_CONTEXT_TRACKING
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+	depends on !SCHED_BFS
 	select VIRT_CPU_ACCOUNTING
 	select CONTEXT_TRACKING
 	help
@@ -694,6 +751,7 @@ config RCU_NOCB_CPU
 	bool "Offload RCU callback processing from boot-selected CPUs"
 	depends on TREE_RCU || PREEMPT_RCU
 	depends on RCU_EXPERT || NO_HZ_FULL
+	depends on !SCHED_BFS
 	default n
 	help
 	  Use this option to reduce OS jitter for aggressive HPC or
@@ -904,6 +962,7 @@ config NUMA_BALANCING
 	depends on ARCH_SUPPORTS_NUMA_BALANCING
 	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	depends on SMP && NUMA && MIGRATION
+	depends on !SCHED_BFS
 	help
 	  This option adds support for automatic NUMA aware memory/task placement.
 	  The mechanism is quite primitive and is based on migrating memory when
@@ -1002,6 +1061,7 @@ config CGROUP_WRITEBACK
 
 menuconfig CGROUP_SCHED
 	bool "CPU controller"
+	depends on !SCHED_BFS
 	default n
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
@@ -1104,6 +1164,7 @@ config CGROUP_DEVICE
 
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting controller"
+	depends on !SCHED_BFS
 	help
 	  Provides a simple controller for monitoring the
 	  total CPU consumed by the tasks in a cgroup.
@@ -1202,6 +1263,7 @@ endif # NAMESPACES
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !SCHED_BFS
 	select CGROUPS
 	select CGROUP_SCHED
 	select FAIR_GROUP_SCHED
@@ -1283,13 +1345,36 @@ source "usr/Kconfig"
 
 endif
 
+choice
+	prompt "Code optimization level"
+	default CC_OPTIMIZE_DEFAULT
+	help
+	  Select the optimization flag to pass to the compiler,
+	  affecting kernel size, speed and compilation time.
+
+	  If in doubt, choose the default optimization level.
+
+config CC_OPTIMIZE_DEFAULT
+	bool "Default optimization"
+	help
+	  This option will pass "-O2" to your compiler. This is
+	  the recommended level and the most tested.
+
+config CC_OPTIMIZE_HARDER
+	bool "Optimize harder"
+	help
+	  This option will pass "-O3" to your compiler resulting
+	  in a larger and faster kernel. The more complex
+	  optimizations also increase compilation time and may
+	  affect stability.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size"
 	help
-	  Enabling this option will pass "-Os" instead of "-O2" to
-	  your compiler resulting in a smaller kernel.
+	  This option will pass "-Os" to your compiler resulting
+	  in a smaller kernel.
 
-	  If unsure, say N.
+endchoice
 
 config SYSCTL
 	bool
@@ -1693,6 +1778,7 @@ choice
 	   This option allows to select a slab allocator.
 
 config SLAB
+	depends on !SCHED_BFS
 	bool "SLAB"
 	help
 	  The regular slab allocator that is established and known to work
@@ -1710,7 +1796,7 @@ config SLUB
 	   a slab allocator.
 
 config SLOB
-	depends on EXPERT
+	depends on EXPERT && !SCHED_BFS
 	bool "SLOB (Simple Allocator)"
 	help
 	   SLOB replaces the stock allocator with a drastically simpler
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dea5de9..1fed726 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -597,6 +597,8 @@ void __init prepare_namespace(void)
 	if (is_floppy && rd_doload && rd_load_disk(0))
 		ROOT_DEV = Root_RAM0;
 
+	check_resume_attempted();
+
 	mount_root();
 out:
 	devtmpfs_mount("dev");
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index a1000ca..56b5a0c 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -15,6 +15,7 @@
 #include <linux/romfs_fs.h>
 #include <linux/initrd.h>
 #include <linux/sched.h>
+#include <linux/suspend.h>
 #include <linux/freezer.h>
 #include <linux/kmod.h>
 
@@ -79,6 +80,11 @@ static void __init handle_initrd(void)
 
 	current->flags &= ~PF_FREEZER_SKIP;
 
+	if (!resume_attempted)
+		printk(KERN_ERR "TuxOnIce: No attempt was made to resume from "
+				"any image that might exist.\n");
+	clear_toi_state(TOI_BOOT_TIME);
+
 	/* move initrd to rootfs' /old */
 	sys_mount("..", ".", NULL, MS_MOVE, NULL);
 	/* switch root and cwd back to / of rootfs */
diff --git a/init/main.c b/init/main.c
index 58c9e37..0551060 100644
--- a/init/main.c
+++ b/init/main.c
@@ -808,7 +808,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
 	return ret;
 }
 
-
 extern initcall_t __initcall_start[];
 extern initcall_t __initcall0_start[];
 extern initcall_t __initcall1_start[];
@@ -945,6 +944,8 @@ static int __ref kernel_init(void *unused)
 
 	rcu_end_inkernel_boot();
 
+	print_scheduler_version();
+
 	if (ramdisk_execute_command) {
 		ret = run_init_process(ramdisk_execute_command);
 		if (!ret)
diff --git a/ipc/shm.c b/ipc/shm.c
index 331fc1b..12934ca 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -578,7 +578,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 		if  ((shmflg & SHM_NORESERVE) &&
 				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
 			acctflag = VM_NORESERVE;
-		file = shmem_kernel_file_setup(name, size, acctflag);
+		file = shmem_kernel_file_setup(name, size, acctflag, 0);
 	}
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index fc1ef73..6697a3d 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
 			}
 		}
 		while (c) {
-			c->write(c, cp, retlen - (cp - kdb_buffer));
+			c->write(c, cp, retlen - (cp - kdb_buffer), 7); /* 7 == KERN_DEBUG */
 			touch_nmi_watchdog();
 			c = c->next;
 		}
@@ -774,7 +774,7 @@ kdb_printit:
 			}
 		}
 		while (c) {
-			c->write(c, moreprompt, strlen(moreprompt));
+			c->write(c, moreprompt, strlen(moreprompt), 7); /* 7 == KERN_DEBUG */
 			touch_nmi_watchdog();
 			c = c->next;
 		}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a..a80d56d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 10e0882..ac8f9b7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,7 +133,7 @@ static void __exit_signal(struct task_struct *tsk)
 	sig->inblock += task_io_get_inblock(tsk);
 	sig->oublock += task_io_get_oublock(tsk);
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
-	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+	sig->sum_sched_runtime += tsk_seruntime(tsk);
 	sig->nr_threads--;
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c7..8fbec37 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep;
 
 static inline struct task_struct *alloc_task_struct_node(int node)
 {
-	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
+	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
 }
 
 static inline void free_task_struct(struct task_struct *tsk)
@@ -464,7 +464,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			struct inode *inode = file_inode(file);
 			struct address_space *mapping = file->f_mapping;
 
-			get_file(file);
+			vma_get_file(tmp);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
 			i_mmap_lock_write(mapping);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173d..12d8a8f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -275,7 +275,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
 	struct kthread_create_info *create = kmalloc(sizeof(*create),
-						     GFP_KERNEL);
+						     GFP_KERNEL | ___GFP_TOI_NOTRACK);
 
 	if (!create)
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a028127..b5c30d9 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
  * of times)
  */
 
-#include <linux/latencytop.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
+#include <linux/latencytop.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
 	proc_create("latency_stats", 0644, NULL, &lstats_fops);
 	return 0;
 }
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err;
+
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (latencytop_enabled)
+		force_schedstat_enabled();
+
+	return err;
+}
 device_initcall(init_lstats_procfs);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 68d3ebc..5f93a3c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -101,6 +101,284 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
+menuconfig TOI_CORE
+	bool "Enhanced Hibernation (TuxOnIce)"
+	depends on HIBERNATION
+	default y
+	---help---
+	  TuxOnIce is the 'new and improved' suspend support.
+
+	  See the TuxOnIce home page (tuxonice.net)
+	  for FAQs, HOWTOs and other documentation.
+
+	comment "Image Storage (you need at least one allocator)"
+		depends on TOI_CORE
+
+	config TOI_FILE
+		bool "File Allocator"
+		depends on TOI_CORE
+		default y
+		---help---
+		  This option enables support for storing an image in a
+		  simple file. You might want this if your swap is
+		  sometimes full enough that you don't have enough spare
+		  space to store an image.
+
+	config TOI_SWAP
+		bool "Swap Allocator"
+		depends on TOI_CORE && SWAP
+		default y
+		---help---
+		  This option enables support for storing an image in your
+		  swap space.
+
+	comment "General Options"
+		depends on TOI_CORE
+
+	config TOI_PRUNE
+		bool "Image pruning support"
+		depends on TOI_CORE && CRYPTO && BROKEN
+		default y
+		---help---
+		  This option adds support for using cryptoapi hashing
+		  algorithms to identify pages with the same content. We
+		  then write a much smaller pointer to the first copy of
+		  the data instead of a complete (perhaps compressed)
+		  additional copy.
+
+		  You probably want this, so say Y here.
+
+	comment "No image pruning support available without Cryptoapi support."
+		depends on TOI_CORE && !CRYPTO
+
+	config TOI_CRYPTO
+		bool "Compression support"
+		depends on TOI_CORE && CRYPTO
+		default y
+		---help---
+		  This option adds support for using cryptoapi compression
+		  algorithms. Compression is particularly useful as it can
+		  more than double your suspend and resume speed (depending
+		  upon how well your image compresses).
+
+		  You probably want this, so say Y here.
+
+	comment "No compression support available without Cryptoapi support."
+		depends on TOI_CORE && !CRYPTO
+
+	config TOI_USERUI
+		bool "Userspace User Interface support"
+		depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
+		default y
+		---help---
+		  This option enables support for a userspace-based user interface
+		  to TuxOnIce, which allows you to have a nice display while suspending
+		  and resuming, and also enables features such as pressing escape to
+		  cancel a cycle or interactive debugging.
+
+	config TOI_USERUI_DEFAULT_PATH
+		string "Default userui program location"
+		default "/usr/local/sbin/tuxoniceui_text"
+		depends on TOI_USERUI
+		---help---
+		  This entry allows you to specify a default path to the userui binary.
+
+	config TOI_DEFAULT_IMAGE_SIZE_LIMIT
+		int "Default image size limit"
+		range -2 65536
+		default "-2"
+		depends on TOI_CORE
+		---help---
+		  This entry allows you to specify a default image size limit. It can
+		  be overridden at run-time using /sys/power/tuxonice/image_size_limit.
+
+	config TOI_KEEP_IMAGE
+		bool "Allow Keep Image Mode"
+		depends on TOI_CORE
+		---help---
+		  This option allows you to keep an image and reuse it. It is intended
+		  __ONLY__ for use with systems where all filesystems are mounted read-
+		  only (kiosks, for example). To use it, compile this option in and boot
+		  normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
+		  When you resume, the image will not be removed. You will be unable to turn
+		  off swap partitions (assuming you are using the swap allocator), but future
+		  suspends simply do a power-down. The image can be updated using the
+		  kernel command line parameter suspend_act= to turn off the keep image
+		  bit. Keep image mode is a little less user friendly on purpose - it
+		  should not be used without thought!
+
+	config TOI_INCREMENTAL
+		bool "Incremental Image Support"
+		depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
+		default n
+		---help---
+		  This option enables the work in progress toward using the dirty page
+		  tracking to record changes to pages. It is hoped that
+		  this will be an initial step toward implementing storing just
+		  the differences between consecutive images, which will
+		  increase the amount of storage needed for the image, but also
+		  increase the speed at which writing an image occurs and
+		  reduce the wear and tear on drives.
+
+		  At the moment, all that is implemented is the first step of keeping
+		  an existing image and then comparing it to the contents in memory
+		  (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
+		  (fake) resume) to see what the page change tracking should find to be
+		  different. If you have verify_image set to 1, TuxOnIce will automatically
+		  invalidate the old image when you next try to hibernate, so there's no
+		  greater chance of disk corruption than normal.
+
+	comment "No incremental image support available without Keep Image support."
+		depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
+
+	config TOI_REPLACE_SWSUSP
+		bool "Replace swsusp by default"
+		default y
+		depends on TOI_CORE
+		---help---
+		  TuxOnIce can replace swsusp. This option makes that the default state,
+		  requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
+		  to use the vanilla kernel functionality. Note that your initrd/ramfs will
+		  need to do this before trying to resume, too.
+		  With overriding swsusp enabled, echoing 'disk' to /sys/power/state will
+		  start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
+		  the swap and file allocators are compiled in, the swap allocator will be
+		  used by default.
+
+	config TOI_IGNORE_LATE_INITCALL
+		bool "Wait for initrd/ramfs to run, by default"
+		default n
+		depends on TOI_CORE
+		---help---
+		  When booting, TuxOnIce can check for an image and start to resume prior
+		  to any initrd/ramfs running (via a late initcall).
+
+		  If you don't have an initrd/ramfs, this is what you want to happen -
+		  otherwise you won't be able to safely resume. You should set this option
+		  to 'No'.
+
+		  If, however, you want your initrd/ramfs to run anyway before resuming,
+		  you need to tell TuxOnIce to ignore that earlier opportunity to resume.
+		  This can be done either by using this compile time option, or by
+		  overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
+
+		  Note that if TuxOnIce can't resume at the earlier opportunity, the
+		  value of this option won't matter - the initramfs/initrd (if any) will
+		  run anyway.
+
+	menuconfig TOI_CLUSTER
+		bool "Cluster support"
+		default n
+		depends on TOI_CORE && NET && BROKEN
+		---help---
+		  Support for linking multiple machines in a cluster so that they suspend
+		  and resume together.
+
+	config TOI_DEFAULT_CLUSTER_INTERFACE
+		string "Default cluster interface"
+		depends on TOI_CLUSTER
+		---help---
+		  The default interface on which to communicate with other nodes in
+		  the cluster.
+
+		  If no value is set here, cluster support will be disabled by default.
+
+	config TOI_DEFAULT_CLUSTER_KEY
+		string "Default cluster key"
+		default "Default"
+		depends on TOI_CLUSTER
+		---help---
+		  The default key used by this node. All nodes in the same cluster
+		  have the same key. Multiple clusters may coexist on the same lan
+		  by using different values for this key.
+
+	config TOI_CLUSTER_IMAGE_TIMEOUT
+		int "Timeout when checking for image"
+		default 15
+		depends on TOI_CLUSTER
+		---help---
+		  Timeout (seconds) before continuing to boot when waiting to see
+		  whether other nodes might have an image. Set to -1 to wait
+		  indefinitely. If WAIT_UNTIL_NODES is non-zero, we might continue
+		  booting sooner than this timeout.
+
+	config TOI_CLUSTER_WAIT_UNTIL_NODES
+		int "Nodes without image before continuing"
+		default 0
+		depends on TOI_CLUSTER
+		---help---
+		  When booting and no image is found, we wait to see if other nodes
+		  have an image before continuing to boot. This value lets us
+		  continue after seeing a certain number of nodes without an image,
+		  instead of continuing to wait for the timeout. Set to 0 to only
+		  use the timeout.
+
+	config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
+		string "Default pre-hibernate script"
+		depends on TOI_CLUSTER
+		---help---
+		  The default script to be called when starting to hibernate.
+
+	config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
+		string "Default post-hibernate script"
+		depends on TOI_CLUSTER
+		---help---
+		  The default script to be called after resuming from hibernation.
+
+	config TOI_DEFAULT_WAIT
+		int "Default waiting time for emergency boot messages"
+		default "25"
+		range -1 32768
+		depends on TOI_CORE
+		help
+		  TuxOnIce can display warnings very early in the process of resuming,
+		  if (for example) it appears that you have booted a kernel that doesn't
+		  match an image on disk. It can then give you the opportunity to either
+		  continue booting that kernel, or reboot the machine. This option can be
+		  used to control how long to wait in such circumstances. -1 means wait
+		  forever. 0 means don't wait at all (do the default action, which will
+		  generally be to continue booting and remove the image). Values of 1 or
+		  more indicate a number of seconds (up to 255) to wait before doing the
+		  default.
+
+	config  TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
+		int "Default extra pages allowance"
+		default "2000"
+		range 500 32768
+		depends on TOI_CORE
+		help
+		  This value controls the default for the allowance TuxOnIce makes for
+		  drivers to allocate extra memory during the atomic copy. The default
+		  value of 2000 will be okay in most cases. If you are using
+		  DRI, the easiest way to find what value to use is to try to hibernate
+		  and look at how many pages were actually needed in the sysfs entry
+		  /sys/power/tuxonice/debug_info (first number on the last line), adding
+		  a little extra because the value is not always the same.
+
+	config TOI_CHECKSUM
+		bool "Checksum pageset2"
+		default n
+		depends on TOI_CORE
+		select CRYPTO
+		select CRYPTO_ALGAPI
+		select CRYPTO_MD4
+		---help---
+		  Adds support for checksumming pageset2 pages, to ensure you really get an
+		  atomic copy. Since some filesystems (XFS especially) change metadata even
+		  when there's no other activity, we need this to check for pages that have
+		  been changed while we were saving the page cache. If your debugging output
+		  always says no pages were resaved, you may be able to safely disable this
+		  option.
+
+config TOI
+	bool
+	depends on TOI_CORE!=n
+	default y
+
+config TOI_ZRAM_SUPPORT
+	def_bool y
+	depends on TOI && ZRAM!=n
+
 config PM_SLEEP
 	def_bool y
 	depends on SUSPEND || HIBERNATE_CALLBACKS
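
Several of the help texts above point at the /sys/power/tuxonice and /sys/power/state files (image_size_limit, replace_swsusp, writing 'disk' to start a cycle). A small C helper performing the trigger described under TOI_REPLACE_SWSUSP could look like the following sketch; the paths and values are the ones quoted in the help texts, everything else is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* write a string to a sysfs file, e.g. "disk" -> /sys/power/state */
static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* optional: cap the image size, as the Kconfig help describes */
	sysfs_write("/sys/power/tuxonice/image_size_limit", "-2");

	/* start a TuxOnIce cycle (relies on the replace_swsusp behaviour) */
	if (sysfs_write("/sys/power/state", "disk")) {
		perror("hibernate");
		return 1;
	}
	return 0;
}
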
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a1..82c4795 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,38 @@
 
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
+tuxonice_core-y := tuxonice_modules.o
+
+obj-$(CONFIG_TOI)		+= tuxonice_builtin.o
+obj-$(CONFIG_TOI_INCREMENTAL)   += tuxonice_incremental.o \
+    tuxonice_copy_before_write.o
+
+tuxonice_core-$(CONFIG_PM_DEBUG)	+= tuxonice_alloc.o
+
+# Compile these in after allocation debugging, if used.
+
+tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
+		tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
+		tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
+		tuxonice_power_off.o tuxonice_atomic_copy.o
+
+tuxonice_core-$(CONFIG_TOI_CHECKSUM)	+= tuxonice_checksum.o
+
+tuxonice_core-$(CONFIG_NET)	+= tuxonice_storage.o tuxonice_netlink.o
+
+obj-$(CONFIG_TOI_CORE)		+= tuxonice_core.o
+obj-$(CONFIG_TOI_PRUNE)		+= tuxonice_prune.o
+obj-$(CONFIG_TOI_CRYPTO)	+= tuxonice_compress.o
+
+tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
+		tuxonice_bio_signature.o
+
+obj-$(CONFIG_TOI_SWAP)		+= tuxonice_bio.o tuxonice_swap.o
+obj-$(CONFIG_TOI_FILE)		+= tuxonice_bio.o tuxonice_file.o
+obj-$(CONFIG_TOI_CLUSTER)	+= tuxonice_cluster.o
+
+obj-$(CONFIG_TOI_USERUI)	+= tuxonice_userui.o
+
 obj-y				+= qos.o
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7dd571..ba2dda5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,7 +31,7 @@
 #include <linux/ktime.h>
 #include <trace/events/power.h>
 
-#include "power.h"
+#include "tuxonice.h"
 
 
 static int nocompress;
@@ -39,7 +39,7 @@ static int noresume;
 static int nohibernate;
 static int resume_wait;
 static unsigned int resume_delay;
-static char resume_file[256] = CONFIG_PM_STD_PARTITION;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
 __visible int in_suspend __nosavedata;
@@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; }
  * platform_begin - Call platform to start hibernation.
  * @platform_mode: Whether or not to use the platform driver.
  */
-static int platform_begin(int platform_mode)
+int platform_begin(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->begin() : 0;
@@ -133,7 +133,7 @@ static int platform_begin(int platform_mode)
  * platform_end - Call platform to finish transition to the working state.
  * @platform_mode: Whether or not to use the platform driver.
  */
-static void platform_end(int platform_mode)
+void platform_end(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->end();
@@ -147,7 +147,7 @@ static void platform_end(int platform_mode)
  * if so configured, and return an error code if that fails.
  */
 
-static int platform_pre_snapshot(int platform_mode)
+int platform_pre_snapshot(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->pre_snapshot() : 0;
@@ -162,7 +162,7 @@ static int platform_pre_snapshot(int platform_mode)
  *
  * This routine is called on one CPU with interrupts disabled.
  */
-static void platform_leave(int platform_mode)
+void platform_leave(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->leave();
@@ -177,7 +177,7 @@ static void platform_leave(int platform_mode)
  *
  * This routine must be called after platform_prepare().
  */
-static void platform_finish(int platform_mode)
+void platform_finish(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->finish();
@@ -193,7 +193,7 @@ static void platform_finish(int platform_mode)
  * If the restore fails after this function has been called,
  * platform_restore_cleanup() must be called.
  */
-static int platform_pre_restore(int platform_mode)
+int platform_pre_restore(int platform_mode)
 {
 	return (platform_mode && hibernation_ops) ?
 		hibernation_ops->pre_restore() : 0;
@@ -210,7 +210,7 @@ static int platform_pre_restore(int platform_mode)
  * function must be called too, regardless of the result of
  * platform_pre_restore().
  */
-static void platform_restore_cleanup(int platform_mode)
+void platform_restore_cleanup(int platform_mode)
 {
 	if (platform_mode && hibernation_ops)
 		hibernation_ops->restore_cleanup();
@@ -220,7 +220,7 @@ static void platform_restore_cleanup(int platform_mode)
  * platform_recover - Recover from a failure to suspend devices.
  * @platform_mode: Whether or not to use the platform driver.
  */
-static void platform_recover(int platform_mode)
+void platform_recover(int platform_mode)
 {
 	if (platform_mode && hibernation_ops && hibernation_ops->recover)
 		hibernation_ops->recover();
@@ -649,6 +649,9 @@ int hibernate(void)
 {
 	int error;
 
+	if (test_action_state(TOI_REPLACE_SWSUSP))
+		return try_tuxonice_hibernate();
+
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
 		return -EPERM;
@@ -738,11 +741,19 @@ int hibernate(void)
  * attempts to recover gracefully and make the kernel return to the normal mode
  * of operation.
  */
-static int software_resume(void)
+int software_resume(void)
 {
 	int error;
 	unsigned int flags;
 
+	resume_attempted = 1;
+
+	/*
+	 * We can't know (until an image header - if any - is loaded) whether
+	 * we did override swsusp. We therefore ensure that both are tried.
+	 */
+	try_tuxonice_resume();
+
 	/*
 	 * If the user said "noresume".. bail out early.
 	 */
@@ -1129,6 +1140,7 @@ static int __init hibernate_setup(char *str)
 static int __init noresume_setup(char *str)
 {
 	noresume = 1;
+	set_toi_state(TOI_NORESUME_SPECIFIED);
 	return 1;
 }
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b..bf4d922 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -36,8 +36,12 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 	return arch_hibernation_header_restore(info) ?
 			"architecture specific data" : NULL;
 }
+#else
+extern char *check_image_kernel(struct swsusp_info *info);
 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+extern int init_header(struct swsusp_info *info);
 
+extern char resume_file[256];
 /*
  * Keep some memory free so that I/O operations can succeed without paging
  * [Might this be more than 4 MB?]
@@ -77,6 +81,7 @@ static struct kobj_attribute _name##_attr = {	\
 	.store	= _name##_store,		\
 }
 
+extern struct pbe *restore_pblist;
 #define power_attr_ro(_name) \
 static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
@@ -269,6 +274,31 @@ static inline void suspend_thaw_processes(void)
 }
 #endif
 
+extern struct page *saveable_page(struct zone *z, unsigned long p);
+#ifdef CONFIG_HIGHMEM
+struct page *saveable_highmem_page(struct zone *z, unsigned long p);
+#else
+static
+inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+	return NULL;
+}
+#endif
+
+#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
+extern struct list_head nosave_regions;
+
+/**
+ *	This structure represents a range of page frames the contents of which
+ *	should not be saved during the suspend.
+ */
+
+struct nosave_region {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+};
+
 #ifdef CONFIG_PM_AUTOSLEEP
 
 /* kernel/power/autosleep.c */
@@ -295,3 +325,10 @@ extern int pm_wake_lock(const char *buf);
 extern int pm_wake_unlock(const char *buf);
 
 #endif /* !CONFIG_PM_WAKELOCKS */
+
+#ifdef CONFIG_TOI
+unsigned long toi_get_nonconflicting_page(void);
+#define BM_END_OF_MAP	(~0UL)
+#else
+#define toi_get_nonconflicting_page() (0)
+#endif
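
struct nosave_region, now shared through power.h, is the element type of the nosave_regions list that register_nosave_region() (declared in include/linux/suspend.h) appends to. A hedged sketch of a driver marking a page-frame range as not-to-be-saved follows; the firmware-buffer scenario is invented for illustration.

#include <linux/suspend.h>

/* hypothetical early-boot code: keep a firmware-owned buffer out of the image */
static int __init example_reserve_nosave(unsigned long start_phys,
					 unsigned long size)
{
	unsigned long start_pfn = start_phys >> PAGE_SHIFT;
	unsigned long end_pfn = (start_phys + size) >> PAGE_SHIFT;

	/* appends a struct nosave_region to the nosave_regions list */
	register_nosave_region(start_pfn, end_pfn);
	return 0;
}
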
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a97060..542163a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,6 +36,9 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
+#include "tuxonice_modules.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_alloc.h"
 #include "power.h"
 
 static int swsusp_page_is_free(struct page *);
@@ -98,6 +101,9 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
+	if (toi_running)
+		return (void *) toi_get_nonconflicting_page();
+
 	res = (void *)get_zeroed_page(gfp_mask);
 	if (safe_needed)
 		while (res && swsusp_page_is_free(virt_to_page(res))) {
@@ -143,6 +149,11 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 
 	page = virt_to_page(addr);
 
+	if (toi_running) {
+		toi__free_page(29, page);
+		return;
+	}
+
 	swsusp_unset_page_forbidden(page);
 	if (clear_nosave_free)
 		swsusp_unset_page_free(page);
@@ -302,13 +313,15 @@ struct bm_position {
 	int node_bit;
 };
 
+#define BM_POSITION_SLOTS (NR_CPUS * 2)
+
 struct memory_bitmap {
 	struct list_head zones;
 	struct linked_page *p_list;	/* list of pages used to store zone
 					 * bitmap objects and bitmap block
 					 * objects
 					 */
-	struct bm_position cur;	/* most recently used bit position */
+	struct bm_position cur[BM_POSITION_SLOTS];    /* most recently used bit position */
 };
 
 /* Functions that operate on memory bitmaps */
@@ -473,16 +486,39 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 		free_image_page(node->data, clear_nosave_free);
 }
 
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+void memory_bm_position_reset(struct memory_bitmap *bm)
 {
-	bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+    int index;
+
+    for (index = 0; index < BM_POSITION_SLOTS; index++) {
+	bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
 				  list);
-	bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+	bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
 				  struct rtree_node, list);
-	bm->cur.node_pfn = 0;
-	bm->cur.node_bit = 0;
+	bm->cur[index].node_pfn = 0;
+	bm->cur[index].node_bit = 0;
+    }
 }
 
+static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
+
+/**
+ *	memory_bm_clear - clear every bit in a memory bitmap
+ *	@bm: the bitmap to clear
+ *
+ *	Only run while single threaded - locking not needed.
+ */
+void memory_bm_clear(struct memory_bitmap *bm)
+{
+	memory_bm_position_reset(bm);
+
+	while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
+		memory_bm_clear_current(bm, 0);
+	}
+
+	memory_bm_position_reset(bm);
+}
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 
 struct mem_extent {
@@ -595,7 +631,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 	}
 
 	bm->p_list = ca.chain;
-	memory_bm_position_reset(bm);
+
+	memory_bm_position_reset(bm);
  Exit:
 	free_mem_extents(&mem_extents);
 	return error;
@@ -631,14 +668,24 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  *	It walks the radix tree to find the page which contains the bit for
  *	pfn and returns the bit position in **addr and *bit_nr.
  */
-static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
-			      void **addr, unsigned int *bit_nr)
+int memory_bm_find_bit(struct memory_bitmap *bm, int index,
+        unsigned long pfn, void **addr, unsigned int *bit_nr)
 {
 	struct mem_zone_bm_rtree *curr, *zone;
 	struct rtree_node *node;
 	int i, block_nr;
 
-	zone = bm->cur.zone;
+	if (!bm->cur[index].zone) {
+		/* reset this slot to the first zone and node */
+		bm->cur[index].zone = list_entry(bm->zones.next,
+				struct mem_zone_bm_rtree, list);
+		bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+				struct rtree_node, list);
+		bm->cur[index].node_pfn = 0;
+		bm->cur[index].node_bit = 0;
+	}
+
+	zone = bm->cur[index].zone;
 
 	if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
 		goto zone_found;
@@ -662,8 +709,8 @@ zone_found:
 	 * node for our pfn.
 	 */
 
-	node = bm->cur.node;
-	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+	node = bm->cur[index].node;
+	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
 		goto node_found;
 
 	node      = zone->rtree;
@@ -680,9 +727,9 @@ zone_found:
 
 node_found:
 	/* Update last position */
-	bm->cur.zone = zone;
-	bm->cur.node = node;
-	bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+	bm->cur[index].zone = zone;
+	bm->cur[index].node = node;
+	bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
 
 	/* Set return values */
 	*addr = node->data;
@@ -691,66 +738,66 @@ node_found:
 	return 0;
 }
 
-static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
 	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
-static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
 	if (!error)
 		set_bit(bit, addr);
 
 	return error;
 }
 
-static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
+void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
 	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
-static void memory_bm_clear_current(struct memory_bitmap *bm)
+static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
 {
 	int bit;
 
-	bit = max(bm->cur.node_bit - 1, 0);
-	clear_bit(bit, bm->cur.node->data);
+	bit = max(bm->cur[index].node_bit - 1, 0);
+	clear_bit(bit, bm->cur[index].node->data);
 }
 
-static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
+int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 	int error;
 
-	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
 	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
-static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
 
-	return !memory_bm_find_bit(bm, pfn, &addr, &bit);
+	return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
 }
 
 /*
@@ -763,25 +810,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  *
  *	Returns true if there is a next node, false otherwise.
  */
-static bool rtree_next_node(struct memory_bitmap *bm)
+static bool rtree_next_node(struct memory_bitmap *bm, int index)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
+	bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
 				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
-		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
-		bm->cur.node_bit  = 0;
+	if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) {
+		bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
+		bm->cur[index].node_bit  = 0;
 		touch_softlockup_watchdog();
 		return true;
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
-		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+	if (&bm->cur[index].zone->list != &bm->zones) {
+		bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
 					  struct rtree_node, list);
-		bm->cur.node_pfn = 0;
-		bm->cur.node_bit = 0;
+		bm->cur[index].node_pfn = 0;
+		bm->cur[index].node_bit = 0;
 		return true;
 	}
 
@@ -799,38 +846,29 @@ static bool rtree_next_node(struct memory_bitmap *bm)
  *	It is required to run memory_bm_position_reset() before the
  *	first call to this function.
  */
-static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
 {
 	unsigned long bits, pfn, pages;
 	int bit;
 
+	index += NR_CPUS; /* Iteration state is separated from get/set/test */
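+	/*
+	 * The iteration cursor is cur[index + NR_CPUS], so get/set/test
+	 * operations using cur[index] cannot move this walk's position.
+	 */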
+
 	do {
-		pages	  = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
-		bits      = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
-		bit	  = find_next_bit(bm->cur.node->data, bits,
-					  bm->cur.node_bit);
+		pages	  = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
+		bits      = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
+		bit	  = find_next_bit(bm->cur[index].node->data, bits,
+					  bm->cur[index].node_bit);
 		if (bit < bits) {
-			pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
-			bm->cur.node_bit = bit + 1;
+			pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
+			bm->cur[index].node_bit = bit + 1;
 			return pfn;
 		}
-	} while (rtree_next_node(bm));
+	} while (rtree_next_node(bm, index));
 
 	return BM_END_OF_MAP;
 }
 
-/**
- *	This structure represents a range of page frames the contents of which
- *	should not be saved during the suspend.
- */
-
-struct nosave_region {
-	struct list_head list;
-	unsigned long start_pfn;
-	unsigned long end_pfn;
-};
-
-static LIST_HEAD(nosave_regions);
+LIST_HEAD(nosave_regions);
 
 /**
  *	register_nosave_region - register a range of page frames the contents
@@ -889,37 +927,37 @@ static struct memory_bitmap *free_pages_map;
 void swsusp_set_page_free(struct page *page)
 {
 	if (free_pages_map)
-		memory_bm_set_bit(free_pages_map, page_to_pfn(page));
+		memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
 }
 
 static int swsusp_page_is_free(struct page *page)
 {
 	return free_pages_map ?
-		memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
+		memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
 }
 
 void swsusp_unset_page_free(struct page *page)
 {
 	if (free_pages_map)
-		memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
+		memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
 }
 
 static void swsusp_set_page_forbidden(struct page *page)
 {
 	if (forbidden_pages_map)
-		memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
+		memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
 }
 
 int swsusp_page_is_forbidden(struct page *page)
 {
 	return forbidden_pages_map ?
-		memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
+		memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
 }
 
 static void swsusp_unset_page_forbidden(struct page *page)
 {
 	if (forbidden_pages_map)
-		memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
+		memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
 }
 
 /**
@@ -950,7 +988,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 				 * touch the PFNs for which the error is
 				 * returned anyway.
 				 */
-				mem_bm_set_bit_check(bm, pfn);
+				mem_bm_set_bit_check(bm, 0, pfn);
 			}
 	}
 }
@@ -1078,7 +1116,7 @@ static unsigned int count_free_highmem_pages(void)
  *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
  *	and it isn't a part of a free chunk of pages.
  */
-static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -1125,11 +1163,6 @@ static unsigned int count_highmem_pages(void)
 	}
 	return n;
 }
-#else
-static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
-	return NULL;
-}
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1140,7 +1173,7 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
  *	of pages statically defined as 'unsaveable', and it isn't a part of
  *	a free chunk of pages.
  */
-static struct page *saveable_page(struct zone *zone, unsigned long pfn)
+struct page *saveable_page(struct zone *zone, unsigned long pfn)
 {
 	struct page *page;
 
@@ -1278,15 +1311,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 		max_zone_pfn = zone_end_pfn(zone);
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (page_is_saveable(zone, pfn))
-				memory_bm_set_bit(orig_bm, pfn);
+				memory_bm_set_bit(orig_bm, 0, pfn);
 	}
 	memory_bm_position_reset(orig_bm);
 	memory_bm_position_reset(copy_bm);
 	for(;;) {
-		pfn = memory_bm_next_pfn(orig_bm);
+		pfn = memory_bm_next_pfn(orig_bm, 0);
 		if (unlikely(pfn == BM_END_OF_MAP))
 			break;
-		copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
+		copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
 	}
 }
 
@@ -1332,8 +1365,8 @@ void swsusp_free(void)
 	memory_bm_position_reset(free_pages_map);
 
 loop:
-	fr_pfn = memory_bm_next_pfn(free_pages_map);
-	fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+	fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+	fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
 
 	/*
 	 * Find the next bit set in both bitmaps. This is guaranteed to
@@ -1341,16 +1374,16 @@ loop:
 	 */
 	do {
 		if (fb_pfn < fr_pfn)
-			fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+			fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
 		if (fr_pfn < fb_pfn)
-			fr_pfn = memory_bm_next_pfn(free_pages_map);
+			fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
 	} while (fb_pfn != fr_pfn);
 
 	if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
 		struct page *page = pfn_to_page(fr_pfn);
 
-		memory_bm_clear_current(forbidden_pages_map);
-		memory_bm_clear_current(free_pages_map);
+		memory_bm_clear_current(forbidden_pages_map, 0);
+		memory_bm_clear_current(free_pages_map, 0);
 		__free_page(page);
 		goto loop;
 	}
@@ -1385,7 +1418,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
 		page = alloc_image_page(mask);
 		if (!page)
 			break;
-		memory_bm_set_bit(&copy_bm, page_to_pfn(page));
+		memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
 		if (PageHighMem(page))
 			alloc_highmem++;
 		else
@@ -1481,7 +1514,7 @@ static unsigned long free_unnecessary_pages(void)
 	memory_bm_position_reset(&copy_bm);
 
 	while (to_free_normal > 0 || to_free_highmem > 0) {
-		unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+		unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
 		struct page *page = pfn_to_page(pfn);
 
 		if (PageHighMem(page)) {
@@ -1495,7 +1528,7 @@ static unsigned long free_unnecessary_pages(void)
 			to_free_normal--;
 			alloc_normal--;
 		}
-		memory_bm_clear_bit(&copy_bm, pfn);
+		memory_bm_clear_bit(&copy_bm, 0, pfn);
 		swsusp_unset_page_forbidden(page);
 		swsusp_unset_page_free(page);
 		__free_page(page);
@@ -1780,7 +1813,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 		struct page *page;
 
 		page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
-		memory_bm_set_bit(bm, page_to_pfn(page));
+		memory_bm_set_bit(bm, 0, page_to_pfn(page));
 	}
 	return nr_highmem;
 }
@@ -1823,7 +1856,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
 			if (!page)
 				goto err_out;
-			memory_bm_set_bit(copy_bm, page_to_pfn(page));
+			memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
 		}
 	}
 
@@ -1838,6 +1871,9 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
+	if (toi_running)
+		return toi_post_context_save();
+
 	printk(KERN_INFO "PM: Creating hibernation image:\n");
 
 	drain_local_pages(NULL);
@@ -1885,7 +1921,7 @@ static int init_header_complete(struct swsusp_info *info)
 	return 0;
 }
 
-static char *check_image_kernel(struct swsusp_info *info)
+char *check_image_kernel(struct swsusp_info *info)
 {
 	if (info->version_code != LINUX_VERSION_CODE)
 		return "kernel version";
@@ -1906,7 +1942,7 @@ unsigned long snapshot_get_image_size(void)
 	return nr_copy_pages + nr_meta_pages + 1;
 }
 
-static int init_header(struct swsusp_info *info)
+int init_header(struct swsusp_info *info)
 {
 	memset(info, 0, sizeof(struct swsusp_info));
 	info->num_physpages = get_num_physpages();
@@ -1928,7 +1964,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	int j;
 
 	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
-		buf[j] = memory_bm_next_pfn(bm);
+		buf[j] = memory_bm_next_pfn(bm, 0);
 		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
 		/* Save page key for data page (s390 only). */
@@ -1979,7 +2015,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
 	} else {
 		struct page *page;
 
-		page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
+		page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
 		if (PageHighMem(page)) {
 			/* Highmem pages are copied to the buffer,
 			 * because we can't return with a kmapped
@@ -2021,7 +2057,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	/* Mark pages that correspond to the "original" pfns as "unsafe" */
 	memory_bm_position_reset(bm);
 	do {
-		pfn = memory_bm_next_pfn(bm);
+		pfn = memory_bm_next_pfn(bm, 0);
 		if (likely(pfn != BM_END_OF_MAP)) {
 			if (likely(pfn_valid(pfn)))
 				swsusp_set_page_free(pfn_to_page(pfn));
@@ -2041,10 +2077,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
 	unsigned long pfn;
 
 	memory_bm_position_reset(src);
-	pfn = memory_bm_next_pfn(src);
+	pfn = memory_bm_next_pfn(src, 0);
 	while (pfn != BM_END_OF_MAP) {
-		memory_bm_set_bit(dst, pfn);
-		pfn = memory_bm_next_pfn(src);
+		memory_bm_set_bit(dst, 0, pfn);
+		pfn = memory_bm_next_pfn(src, 0);
 	}
 }
 
@@ -2095,8 +2131,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		/* Extract and buffer page key for data page (s390 only). */
 		page_key_memorize(buf + j);
 
-		if (memory_bm_pfn_present(bm, buf[j]))
-			memory_bm_set_bit(bm, buf[j]);
+		if (memory_bm_pfn_present(bm, 0, buf[j]))
+			memory_bm_set_bit(bm, 0, buf[j]);
 		else
 			return -EFAULT;
 	}
@@ -2139,12 +2175,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 	unsigned int cnt = 0;
 
 	memory_bm_position_reset(bm);
-	pfn = memory_bm_next_pfn(bm);
+	pfn = memory_bm_next_pfn(bm, 0);
 	while (pfn != BM_END_OF_MAP) {
 		if (PageHighMem(pfn_to_page(pfn)))
 			cnt++;
 
-		pfn = memory_bm_next_pfn(bm);
+		pfn = memory_bm_next_pfn(bm, 0);
 	}
 	return cnt;
 }
@@ -2189,7 +2225,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
 		page = alloc_page(__GFP_HIGHMEM);
 		if (!swsusp_page_is_free(page)) {
 			/* The page is "safe", set its bit the bitmap */
-			memory_bm_set_bit(bm, page_to_pfn(page));
+			memory_bm_set_bit(bm, 0, page_to_pfn(page));
 			safe_highmem_pages++;
 		}
 		/* Mark the page as allocated */
@@ -2247,7 +2283,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
 
 		/* Copy of the page will be stored in high memory */
 		kaddr = buffer;
-		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
 		safe_highmem_pages--;
 		last_highmem_page = tmp;
 		pbe->copy_page = tmp;
@@ -2418,7 +2454,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
 	struct pbe *pbe;
 	struct page *page;
-	unsigned long pfn = memory_bm_next_pfn(bm);
+	unsigned long pfn = memory_bm_next_pfn(bm, 0);
 
 	if (pfn == BM_END_OF_MAP)
 		return ERR_PTR(-EFAULT);
@@ -2605,3 +2641,82 @@ int restore_highmem(void)
 	return 0;
 }
 #endif /* CONFIG_HIGHMEM */
+
+struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
+  *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
+
+int resume_attempted;
+
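+/*
+ * memory_bm_write - feed a bitmap's contents to a TuxOnIce I/O routine.
+ *
+ * Resets the bitmap position and hands each leaf node's PAGE_SIZE block of
+ * bit data to @rw_chunk for writing, walking the radix trees via cursor
+ * slot 0.
+ */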
+int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
+	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
+{
+	int result;
+
+	memory_bm_position_reset(bm);
+
+	do {
+		result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data,
+				PAGE_SIZE);
+		if (result)
+			return result;
+	} while (rtree_next_node(bm, 0));
+	return 0;
+}
+
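+/*
+ * memory_bm_read - read-side counterpart of memory_bm_write(): refill each
+ * leaf node's bit data from @rw_chunk in the order it was written.
+ */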
+int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
+	(int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
+{
+	int result;
+
+	memory_bm_position_reset(bm);
+
+	do {
+		result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data,
+				PAGE_SIZE);
+		if (result)
+			return result;
+	} while (rtree_next_node(bm, 0));
+	return 0;
+}
+
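+/*
+ * memory_bm_space_needed - bytes needed to store this bitmap's contents
+ * (one PAGE_SIZE block per leaf node).
+ */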
+int memory_bm_space_needed(struct memory_bitmap *bm)
+{
+	unsigned long bytes = 0;
+
+	memory_bm_position_reset(bm);
+	do {
+		bytes += PAGE_SIZE;
+	} while (rtree_next_node(bm, 0));
+	return bytes;
+}
+
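+/*
+ * toi_alloc_bitmap - allocate a struct memory_bitmap and build its radix
+ * trees. PG_ANY is passed to memory_bm_create(), so the bitmap's own pages
+ * may be allocated anywhere.
+ */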
+int toi_alloc_bitmap(struct memory_bitmap **bm)
+{
+	int error;
+	struct memory_bitmap *bm1;
+
+	bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+	if (!bm1)
+		return -ENOMEM;
+
+	error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
+	if (error) {
+		printk(KERN_ERR "toi_alloc_bitmap: memory_bm_create failed (%d).\n",
+				error);
+		kfree(bm1);
+		return -ENOMEM;
+	}
+
+	*bm = bm1;
+	return 0;
+}
+
+void toi_free_bitmap(struct memory_bitmap **bm)
+{
+	if (!*bm)
+		return;
+
+	memory_bm_free(*bm, 0);
+	kfree(*bm);
+	*bm = NULL;
+}
diff --git b/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
new file mode 100644
index 0000000..10b6563
--- /dev/null
+++ b/kernel/power/tuxonice.h
@@ -0,0 +1,260 @@
+/*
+ * kernel/power/tuxonice.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations used throughout swsusp.
+ *
+ */
+
+#ifndef KERNEL_POWER_TOI_H
+#define KERNEL_POWER_TOI_H
+
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/suspend.h>
+#include <linux/fs.h>
+#include <asm/setup.h>
+#include "tuxonice_pageflags.h"
+#include "power.h"
+
+#define TOI_CORE_VERSION "3.3"
+#define TOI_HEADER_VERSION 3
+#define MY_BOOT_KERNEL_DATA_VERSION 4
+
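+/*
+ * Data shared with the kernel in the image being resumed. The version and
+ * size fields let mismatched boot and image kernels cope with differing
+ * layouts of this structure (see toi_hibernate(), which copies only the
+ * smaller of the two sizes).
+ */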
+struct toi_boot_kernel_data {
+        int version;
+        int size;
+        unsigned long toi_action;
+        unsigned long toi_debug_state;
+        u32 toi_default_console_level;
+        int toi_io_time[2][2];
+        char toi_nosave_commandline[COMMAND_LINE_SIZE];
+        unsigned long pages_used[33];
+        unsigned long incremental_bytes_in;
+        unsigned long incremental_bytes_out;
+        unsigned long compress_bytes_in;
+        unsigned long compress_bytes_out;
+        unsigned long pruned_pages;
+};
+
+extern struct toi_boot_kernel_data toi_bkd;
+
+/* Location of boot kernel data struct in kernel being resumed */
+extern unsigned long boot_kernel_data_buffer;
+
+/*                 == Action states ==                 */
+
+enum {
+        TOI_REBOOT,
+        TOI_PAUSE,
+        TOI_LOGALL,
+        TOI_CAN_CANCEL,
+        TOI_KEEP_IMAGE,
+        TOI_FREEZER_TEST,
+        TOI_SINGLESTEP,
+        TOI_PAUSE_NEAR_PAGESET_END,
+        TOI_TEST_FILTER_SPEED,
+        TOI_TEST_BIO,
+        TOI_NO_PAGESET2,
+        TOI_IGNORE_ROOTFS,
+        TOI_REPLACE_SWSUSP,
+        TOI_PAGESET2_FULL,
+        TOI_ABORT_ON_RESAVE_NEEDED,
+        TOI_NO_MULTITHREADED_IO,
+        TOI_NO_DIRECT_LOAD, /* Obsolete */
+        TOI_LATE_CPU_HOTPLUG, /* Obsolete */
+        TOI_GET_MAX_MEM_ALLOCD,
+        TOI_NO_FLUSHER_THREAD,
+        TOI_NO_PS2_IF_UNNEEDED,
+        TOI_POST_RESUME_BREAKPOINT,
+        TOI_NO_READAHEAD,
+        TOI_TRACE_DEBUG_ON,
+        TOI_INCREMENTAL_IMAGE,
+};
+
+extern unsigned long toi_bootflags_mask;
+
+#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
+
+/*                 == Result states ==                 */
+
+enum {
+        TOI_ABORTED,
+        TOI_ABORT_REQUESTED,
+        TOI_NOSTORAGE_AVAILABLE,
+        TOI_INSUFFICIENT_STORAGE,
+        TOI_FREEZING_FAILED,
+        TOI_KEPT_IMAGE,
+        TOI_WOULD_EAT_MEMORY,
+        TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
+        TOI_PM_SEM,
+        TOI_DEVICE_REFUSED,
+        TOI_SYSDEV_REFUSED,
+        TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
+        TOI_UNABLE_TO_PREPARE_IMAGE,
+        TOI_FAILED_MODULE_INIT,
+        TOI_FAILED_MODULE_CLEANUP,
+        TOI_FAILED_IO,
+        TOI_OUT_OF_MEMORY,
+        TOI_IMAGE_ERROR,
+        TOI_PLATFORM_PREP_FAILED,
+        TOI_CPU_HOTPLUG_FAILED,
+        TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */
+        TOI_RESAVE_NEEDED,
+        TOI_CANT_SUSPEND,
+        TOI_NOTIFIERS_PREPARE_FAILED,
+        TOI_PRE_SNAPSHOT_FAILED,
+        TOI_PRE_RESTORE_FAILED,
+        TOI_USERMODE_HELPERS_ERR,
+        TOI_CANT_USE_ALT_RESUME,
+        TOI_HEADER_TOO_BIG,
+        TOI_WAKEUP_EVENT,
+        TOI_SYSCORE_REFUSED,
+        TOI_DPM_PREPARE_FAILED,
+        TOI_DPM_SUSPEND_FAILED,
+        TOI_NUM_RESULT_STATES        /* Used in printing debug info only */
+};
+
+extern unsigned long toi_result;
+
+#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
+#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
+                                test_and_set_bit(bit, &toi_result))
+#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
+#define test_result_state(bit) (test_bit(bit, &toi_result))
+
+/*         == Debug sections and levels ==         */
+
+/* debugging levels. */
+enum {
+        TOI_STATUS = 0,
+        TOI_ERROR = 2,
+        TOI_LOW,
+        TOI_MEDIUM,
+        TOI_HIGH,
+        TOI_VERBOSE,
+};
+
+enum {
+        TOI_ANY_SECTION,
+        TOI_EAT_MEMORY,
+        TOI_IO,
+        TOI_HEADER,
+        TOI_WRITER,
+        TOI_MEMORY,
+        TOI_PAGEDIR,
+        TOI_COMPRESS,
+        TOI_BIO,
+};
+
+#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
+#define clear_debug_state(bit) \
+        (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
+#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
+
+/*                == Steps in hibernating ==        */
+
+enum {
+        STEP_HIBERNATE_PREPARE_IMAGE,
+        STEP_HIBERNATE_SAVE_IMAGE,
+        STEP_HIBERNATE_POWERDOWN,
+        STEP_RESUME_CAN_RESUME,
+        STEP_RESUME_LOAD_PS1,
+        STEP_RESUME_DO_RESTORE,
+        STEP_RESUME_READ_PS2,
+        STEP_RESUME_GO,
+        STEP_RESUME_ALT_IMAGE,
+        STEP_CLEANUP,
+        STEP_QUIET_CLEANUP
+};
+
+/*                == TuxOnIce states ==
+        (see also include/linux/suspend.h)        */
+
+#define get_toi_state()  (toi_state)
+#define restore_toi_state(saved_state) \
+        do { toi_state = saved_state; } while (0)
+
+/*                == Module support ==                */
+
+struct toi_core_fns {
+        int (*post_context_save)(void);
+        unsigned long (*get_nonconflicting_page)(void);
+        int (*try_hibernate)(void);
+        void (*try_resume)(void);
+};
+
+extern struct toi_core_fns *toi_core_fns;
+
+/*                == All else ==                        */
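+/* Convert a number of pages to kilobytes / megabytes. */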
+#define KB(x) ((x) << (PAGE_SHIFT - 10))
+#define MB(x) ((x) >> (20 - PAGE_SHIFT))
+
+extern int toi_start_anything(int toi_or_resume);
+extern void toi_finish_anything(int toi_or_resume);
+
+extern int save_image_part1(void);
+extern int toi_atomic_restore(void);
+
+extern int toi_try_hibernate(void);
+extern void toi_try_resume(void);
+
+extern int __toi_post_context_save(void);
+
+extern unsigned int nr_hibernates;
+extern char alt_resume_param[256];
+
+extern void copyback_post(void);
+extern int toi_hibernate(void);
+extern unsigned long extra_pd1_pages_used;
+
+#define SECTOR_SIZE 512
+
+extern void toi_early_boot_message(int can_erase_image, int default_answer,
+        char *warning_reason, ...);
+
+extern int do_check_can_resume(void);
+extern int do_toi_step(int step);
+extern int toi_launch_userspace_program(char *command, int channel_no,
+                int wait, int debug);
+
+extern char tuxonice_signature[9];
+
+extern int toi_start_other_threads(void);
+extern void toi_stop_other_threads(void);
+
+extern int toi_trace_index;
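+/* Emit a per-pfn trace line when the TOI_TRACE_DEBUG_ON action flag is set. */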
+#define TOI_TRACE_DEBUG(PFN, DESC, ...) \
+    do { \
+        if (test_action_state(TOI_TRACE_DEBUG_ON)) { \
+            printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \
+        } \
+    } while(0)
+
+#ifdef CONFIG_TOI_KEEP_IMAGE
+#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE))
+#else
+#define toi_keeping_image (0)
+#endif
+
+#ifdef CONFIG_TOI_INCREMENTAL
+extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose);
+extern int toi_reset_dirtiness(int verbose);
+extern void toi_cbw_write(void);
+extern void toi_cbw_restore(void);
+extern int toi_allocate_cbw_data(void);
+extern void toi_free_cbw_data(void);
+extern int toi_cbw_init(void);
+extern void toi_mark_tasks_cbw(void);
+#else
+static inline int toi_reset_dirtiness(int verbose) { return 0; }
+#define toi_cbw_write() do { } while(0)
+#define toi_cbw_restore() do { } while(0)
+#define toi_allocate_cbw_data() do { } while(0)
+#define toi_free_cbw_data() do { } while(0)
+static inline int toi_cbw_init(void) { return 0; }
+#endif
+#endif
diff --git b/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
new file mode 100644
index 0000000..1d8b1cb
--- /dev/null
+++ b/kernel/power/tuxonice_alloc.c
@@ -0,0 +1,308 @@
+/*
+ * kernel/power/tuxonice_alloc.c
+ *
+ * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include "tuxonice_modules.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+
+#define TOI_ALLOC_PATHS 41
+
+static DEFINE_MUTEX(toi_alloc_mutex);
+
+static struct toi_module_ops toi_alloc_ops;
+
+static int toi_fail_num;
+
+static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
+                toi_free_count[TOI_ALLOC_PATHS],
+                toi_test_count[TOI_ALLOC_PATHS],
+                toi_fail_count[TOI_ALLOC_PATHS];
+static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
+static int cur_allocd, max_allocd;
+
+static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
+        "", /* 0 */
+        "get_io_info_struct",
+        "extent",
+        "extent (loading chain)",
+        "userui channel",
+        "userui arg", /* 5 */
+        "attention list metadata",
+        "extra pagedir memory metadata",
+        "bdev metadata",
+        "extra pagedir memory",
+        "header_locations_read", /* 10 */
+        "bio queue",
+        "prepare_readahead",
+        "i/o buffer",
+        "writer buffer in bio_init",
+        "checksum buffer", /* 15 */
+        "compression buffer",
+        "filewriter signature op",
+        "set resume param alloc1",
+        "set resume param alloc2",
+        "debugging info buffer", /* 20 */
+        "check can resume buffer",
+        "write module config buffer",
+        "read module config buffer",
+        "write image header buffer",
+        "read pageset1 buffer", /* 25 */
+        "get_have_image_data buffer",
+        "checksum page",
+        "worker rw loop",
+        "get nonconflicting page",
+        "ps1 load addresses", /* 30 */
+        "remove swap image",
+        "swap image exists",
+        "swap parse sig location",
+        "sysfs kobj",
+        "swap mark resume attempted buffer", /* 35 */
+        "cluster member",
+        "boot kernel data buffer",
+        "setting swap signature",
+        "block i/o bdev struct",
+        "copy before write", /* 40 */
+};
+
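+/*
+ * MIGHT_FAIL - allocation failure injection.
+ *
+ * When the "failure_test" sysfs entry (toi_fail_num) names this allocation
+ * path, fail it once, account the event in toi_test_count and clear the
+ * trigger so subsequent allocations behave normally.
+ */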
+#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
+        do { \
+                BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
+                \
+                if (FAIL_NUM == toi_fail_num) { \
+                        atomic_inc(&toi_test_count[FAIL_NUM]); \
+                        toi_fail_num = 0; \
+                        return FAIL_VAL; \
+                } \
+        } while (0)
+
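+/*
+ * alloc_update_stats - account an allocation (or its failure) against the
+ * path that made it and, when TOI_GET_MAX_MEM_ALLOCD is set, track the
+ * high-water mark of memory allocated through these wrappers.
+ */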
+static void alloc_update_stats(int fail_num, void *result, int size)
+{
+        if (!result) {
+                atomic_inc(&toi_fail_count[fail_num]);
+                return;
+        }
+
+        atomic_inc(&toi_alloc_count[fail_num]);
+        if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
+                mutex_lock(&toi_alloc_mutex);
+                toi_cur_allocd[fail_num]++;
+                cur_allocd += size;
+                if (unlikely(cur_allocd > max_allocd)) {
+                        int i;
+
+                        for (i = 0; i < TOI_ALLOC_PATHS; i++)
+                                toi_max_allocd[i] = toi_cur_allocd[i];
+                        max_allocd = cur_allocd;
+                }
+                mutex_unlock(&toi_alloc_mutex);
+        }
+}
+
+static void free_update_stats(int fail_num, int size)
+{
+        BUG_ON(fail_num >= TOI_ALLOC_PATHS);
+        atomic_inc(&toi_free_count[fail_num]);
+        if (unlikely(atomic_read(&toi_free_count[fail_num]) >
+                                atomic_read(&toi_alloc_count[fail_num])))
+                dump_stack();
+        if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
+                mutex_lock(&toi_alloc_mutex);
+                cur_allocd -= size;
+                toi_cur_allocd[fail_num]--;
+                mutex_unlock(&toi_alloc_mutex);
+        }
+}
+
+void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
+{
+        void *result;
+
+        if (toi_alloc_ops.enabled)
+                MIGHT_FAIL(fail_num, NULL);
+        result = kzalloc(size, flags);
+        if (toi_alloc_ops.enabled)
+                alloc_update_stats(fail_num, result, size);
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        return result;
+}
+
+unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
+                unsigned int order)
+{
+        unsigned long result;
+
+        mask |= ___GFP_TOI_NOTRACK;
+        if (toi_alloc_ops.enabled)
+                MIGHT_FAIL(fail_num, 0);
+        result = __get_free_pages(mask, order);
+        if (toi_alloc_ops.enabled)
+                alloc_update_stats(fail_num, (void *) result,
+                                PAGE_SIZE << order);
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        return result;
+}
+
+struct page *toi_alloc_page(int fail_num, gfp_t mask)
+{
+        struct page *result;
+
+        if (toi_alloc_ops.enabled)
+                MIGHT_FAIL(fail_num, NULL);
+        mask |= ___GFP_TOI_NOTRACK;
+        result = alloc_page(mask);
+        if (toi_alloc_ops.enabled)
+                alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        return result;
+}
+
+unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
+{
+        unsigned long result;
+
+        if (toi_alloc_ops.enabled)
+                MIGHT_FAIL(fail_num, 0);
+        mask |= ___GFP_TOI_NOTRACK;
+        result = get_zeroed_page(mask);
+        if (toi_alloc_ops.enabled)
+                alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        return result;
+}
+
+void toi_kfree(int fail_num, const void *arg, int size)
+{
+        if (arg && toi_alloc_ops.enabled)
+                free_update_stats(fail_num, size);
+
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        kfree(arg);
+}
+
+void toi_free_page(int fail_num, unsigned long virt)
+{
+        if (virt && toi_alloc_ops.enabled)
+                free_update_stats(fail_num, PAGE_SIZE);
+
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        free_page(virt);
+}
+
+void toi__free_page(int fail_num, struct page *page)
+{
+        if (page && toi_alloc_ops.enabled)
+                free_update_stats(fail_num, PAGE_SIZE);
+
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        __free_page(page);
+}
+
+void toi_free_pages(int fail_num, struct page *page, int order)
+{
+        if (page && toi_alloc_ops.enabled)
+                free_update_stats(fail_num, PAGE_SIZE << order);
+
+        if (fail_num == toi_trace_allocs)
+                dump_stack();
+        __free_pages(page, order);
+}
+
+void toi_alloc_print_debug_stats(void)
+{
+        int i, header_done = 0;
+
+        if (!toi_alloc_ops.enabled)
+                return;
+
+        for (i = 0; i < TOI_ALLOC_PATHS; i++)
+                if (atomic_read(&toi_alloc_count[i]) !=
+                    atomic_read(&toi_free_count[i])) {
+                        if (!header_done) {
+                                printk(KERN_INFO "Idx  Allocs   Frees   Tests "
+                                        "  Fails     Max Description\n");
+                                header_done = 1;
+                        }
+
+                        printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
+                                atomic_read(&toi_alloc_count[i]),
+                                atomic_read(&toi_free_count[i]),
+                                atomic_read(&toi_test_count[i]),
+                                atomic_read(&toi_fail_count[i]),
+                                toi_max_allocd[i],
+                                toi_alloc_desc[i]);
+                }
+}
+
+static int toi_alloc_initialise(int starting_cycle)
+{
+        int i;
+
+        if (!starting_cycle)
+                return 0;
+
+        if (toi_trace_allocs)
+                dump_stack();
+
+        for (i = 0; i < TOI_ALLOC_PATHS; i++) {
+                atomic_set(&toi_alloc_count[i], 0);
+                atomic_set(&toi_free_count[i], 0);
+                atomic_set(&toi_test_count[i], 0);
+                atomic_set(&toi_fail_count[i], 0);
+                toi_cur_allocd[i] = 0;
+                toi_max_allocd[i] = 0;
+        }
+
+        max_allocd = 0;
+        cur_allocd = 0;
+        return 0;
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL),
+        SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0,
+                        NULL),
+        SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_GET_MAX_MEM_ALLOCD, 0),
+        SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0,
+                        NULL)
+};
+
+static struct toi_module_ops toi_alloc_ops = {
+        .type                                        = MISC_HIDDEN_MODULE,
+        .name                                        = "allocation debugging",
+        .directory                                = "alloc",
+        .module                                        = THIS_MODULE,
+        .early                                        = 1,
+        .initialise                                = toi_alloc_initialise,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+int toi_alloc_init(void)
+{
+        int result = toi_register_module(&toi_alloc_ops);
+        return result;
+}
+
+void toi_alloc_exit(void)
+{
+        toi_unregister_module(&toi_alloc_ops);
+}
diff --git b/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
new file mode 100644
index 0000000..0cd6b68
--- /dev/null
+++ b/kernel/power/tuxonice_alloc.h
@@ -0,0 +1,54 @@
+/*
+ * kernel/power/tuxonice_alloc.h
+ *
+ * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/slab.h>
+#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN)
+#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
+
+#ifdef CONFIG_PM_DEBUG
+extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
+extern void toi_kfree(int fail_num, const void *arg, int size);
+
+extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
+                unsigned int order);
+#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
+extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
+extern void toi_free_page(int fail_num, unsigned long buf);
+extern void toi__free_page(int fail_num, struct page *page);
+extern void toi_free_pages(int fail_num, struct page *page, int order);
+extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
+extern int toi_alloc_init(void);
+extern void toi_alloc_exit(void);
+
+extern void toi_alloc_print_debug_stats(void);
+
+#else /* CONFIG_PM_DEBUG */
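+/*
+ * Without PM debugging, the wrappers collapse to the ordinary allocators and
+ * the accounting helpers become no-ops.
+ */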
+
+#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
+#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN))
+
+#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
+#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
+#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
+#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
+#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
+#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
+#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
+static inline int toi_alloc_init(void)
+{
+        return 0;
+}
+
+static inline void toi_alloc_exit(void) { }
+
+static inline void toi_alloc_print_debug_stats(void) { }
+
+#endif
+
+extern int toi_trace_allocs;
diff --git b/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
new file mode 100644
index 0000000..5845217
--- /dev/null
+++ b/kernel/power/tuxonice_atomic_copy.c
@@ -0,0 +1,469 @@
+/*
+ * kernel/power/tuxonice_atomic_copy.c
+ *
+ * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * Routines for doing the atomic save/restore.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
+#include <asm/suspend.h>
+#include "tuxonice.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_io.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_modules.h"
+
+unsigned long extra_pd1_pages_used;
+
+/**
+ * free_pbe_list - free page backup entries used by the atomic copy code.
+ * @list:        List to free.
+ * @highmem:        Whether the list is in highmem.
+ *
+ * Normally, this function isn't used. If, however, we need to abort before
+ * doing the atomic copy, we use this to free the pbes previously allocated.
+ **/
+static void free_pbe_list(struct pbe **list, int highmem)
+{
+        while (*list) {
+                int i;
+                struct pbe *free_pbe, *next_page = NULL;
+                struct page *page;
+
+                if (highmem) {
+                        page = (struct page *) *list;
+                        free_pbe = (struct pbe *) kmap(page);
+                } else {
+                        page = virt_to_page(*list);
+                        free_pbe = *list;
+                }
+
+                for (i = 0; i < PBES_PER_PAGE; i++) {
+                        if (!free_pbe)
+                                break;
+                        if (highmem)
+                                toi__free_page(29, free_pbe->address);
+                        else
+                                toi_free_page(29,
+                                        (unsigned long) free_pbe->address);
+                        free_pbe = free_pbe->next;
+                }
+
+                if (highmem) {
+                        if (free_pbe)
+                                next_page = free_pbe;
+                        kunmap(page);
+                } else {
+                        if (free_pbe)
+                                next_page = free_pbe;
+                }
+
+                toi__free_page(29, page);
+                *list = (struct pbe *) next_page;
+        }
+}
+
+/**
+ * copyback_post - post atomic-restore actions
+ *
+ * After doing the atomic restore, we have a few more things to do:
+ *        1) We want to retain some values across the restore, so we now copy
+ *        these from the nosave variables to the normal ones.
+ *        2) Set the status flags.
+ *        3) Resume devices.
+ *        4) Tell userui so it can redraw & restore settings.
+ *        5) Reread the page cache.
+ **/
+void copyback_post(void)
+{
+        struct toi_boot_kernel_data *bkd =
+                (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
+
+        if (toi_activate_storage(1))
+                panic("Failed to reactivate our storage.");
+
+        toi_post_atomic_restore_modules(bkd);
+
+        toi_cond_pause(1, "About to reload secondary pagedir.");
+
+        if (read_pageset2(0))
+                panic("Unable to successfully reread the page cache.");
+
+        /*
+         * If the user wants to sleep again after resuming from full-off,
+         * it's most likely to be in order to suspend to ram, so we'll
+         * do this check after loading pageset2, to give them the fastest
+         * wakeup when they are ready to use the computer again.
+         */
+        toi_check_resleep();
+
+        if (test_action_state(TOI_INCREMENTAL_IMAGE))
+                toi_reset_dirtiness(1);
+}
+
+/**
+ * toi_copy_pageset1 - do the atomic copy of pageset1
+ *
+ * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
+ * because we can't be sure what side effects it has. On my old Duron, with
+ * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
+ * count at resume time 4 instead of 3.
+ *
+ * We don't want to call kmap_atomic unconditionally because it has the side
+ * effect of incrementing the preempt count, which will leave it one too high
+ * post resume (the page containing the preempt count will be copied after
+ * it is incremented; this is essentially the same problem).
+ **/
+void toi_copy_pageset1(void)
+{
+        int i;
+        unsigned long source_index, dest_index;
+
+        memory_bm_position_reset(pageset1_map);
+        memory_bm_position_reset(pageset1_copy_map);
+
+        source_index = memory_bm_next_pfn(pageset1_map, 0);
+        dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
+
+        for (i = 0; i < pagedir1.size; i++) {
+                unsigned long *origvirt, *copyvirt;
+                struct page *origpage, *copypage;
+                int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
+                    was_present1, was_present2;
+
+                origpage = pfn_to_page(source_index);
+                copypage = pfn_to_page(dest_index);
+
+                origvirt = PageHighMem(origpage) ?
+                        kmap_atomic(origpage) :
+                        page_address(origpage);
+
+                copyvirt = PageHighMem(copypage) ?
+                        kmap_atomic(copypage) :
+                        page_address(copypage);
+
+                was_present1 = kernel_page_present(origpage);
+                if (!was_present1)
+                        kernel_map_pages(origpage, 1, 1);
+
+                was_present2 = kernel_page_present(copypage);
+                if (!was_present2)
+                        kernel_map_pages(copypage, 1, 1);
+
+                while (loop >= 0) {
+                        *(copyvirt + loop) = *(origvirt + loop);
+                        loop--;
+                }
+
+                if (!was_present1)
+                        kernel_map_pages(origpage, 1, 0);
+
+                if (!was_present2)
+                        kernel_map_pages(copypage, 1, 0);
+
+                if (PageHighMem(origpage))
+                        kunmap_atomic(origvirt);
+
+                if (PageHighMem(copypage))
+                        kunmap_atomic(copyvirt);
+
+                source_index = memory_bm_next_pfn(pageset1_map, 0);
+                dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
+        }
+}
+
+/**
+ * __toi_post_context_save - steps after saving the cpu context
+ *
+ * Steps taken after saving the CPU state to make the actual
+ * atomic copy.
+ *
+ * Called from swsusp_save in snapshot.c via toi_post_context_save.
+ **/
+int __toi_post_context_save(void)
+{
+        unsigned long old_ps1_size = pagedir1.size;
+
+        check_checksums();
+
+        free_checksum_pages();
+
+        toi_recalculate_image_contents(1);
+
+        extra_pd1_pages_used = pagedir1.size > old_ps1_size ?
+                pagedir1.size - old_ps1_size : 0;
+
+        if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
+                printk(KERN_INFO "Pageset1 has grown by %lu pages. "
+                        "extra_pages_allowance is currently only %lu.\n",
+                        pagedir1.size - old_ps1_size,
+                        extra_pd1_pages_allowance);
+
+                /*
+                 * Highlevel code will see this, clear the state and
+                 * retry if we haven't already done so twice.
+                 */
+                if (any_to_free(1)) {
+                        set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+                        return 1;
+                }
+                if (try_allocate_extra_memory()) {
+                        printk(KERN_INFO "Failed to allocate the extra memory"
+                                        " needed. Restarting the process.\n");
+                        set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+                        return 1;
+                }
+                printk(KERN_INFO "However, it looks like there's enough"
+                        " free ram and storage to handle this, so"
+                        " continuing anyway.\n");
+                /*
+                 * try_allocate_extra_memory() above may call
+                 * toi_allocate_extra_pagedir_memory(), which can allocate a
+                 * new slab page via toi_kzalloc() that belongs in ps1, so
+                 * recalculate the image contents once more.
+                 */
+                toi_recalculate_image_contents(1);
+        }
+
+        if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
+            !test_action_state(TOI_TEST_BIO))
+                toi_copy_pageset1();
+
+        return 0;
+}
+
+/**
+ * toi_hibernate - high level code for doing the atomic copy
+ *
+ * High-level code which prepares to do the atomic copy. Loosely based
+ * on the swsusp version, but with the following twists:
+ *        - We set toi_running so the swsusp code uses our code paths.
+ *        - We give better feedback regarding what goes wrong if there is a
+ *          problem.
+ *        - We use an extra function to call the assembly, just in case this code
+ *          is in a module (return address).
+ **/
+int toi_hibernate(void)
+{
+        int error;
+
+        error = toi_lowlevel_builtin();
+
+        if (!error) {
+                struct toi_boot_kernel_data *bkd =
+                        (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
+
+                /*
+                 * The boot kernel's data may be larger (newer version) or
+                 * smaller (older version) than ours. Copy the minimum
+                 * of the two sizes, so that we don't overwrite valid values
+                 * from pre-atomic copy.
+                 */
+
+                memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
+                        min_t(int, sizeof(struct toi_boot_kernel_data),
+                                bkd->size));
+        }
+
+        return error;
+}
+
+/**
+ * toi_atomic_restore - prepare to do the atomic restore
+ *
+ * Get ready to do the atomic restore. This part gets us into the same
+ * state we were in prior to calling do_toi_lowlevel while
+ * hibernating: hot-unplugging secondary cpus and freezing processes,
+ * before starting the thread that will do the restore.
+ **/
+int toi_atomic_restore(void)
+{
+        int error;
+
+        toi_prepare_status(DONT_CLEAR_BAR,        "Atomic restore.");
+
+        memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
+                strlen(saved_command_line));
+
+        toi_pre_atomic_restore_modules(&toi_bkd);
+
+        if (add_boot_kernel_data_pbe())
+                goto Failed;
+
+        toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
+
+        if (toi_go_atomic(PMSG_QUIESCE, 0))
+                goto Failed;
+
+        /* We'll ignore saved state, but this gets preempt count (etc) right */
+        save_processor_state();
+
+        error = swsusp_arch_resume();
+        /*
+         * Code below is only ever reached in case of failure. Otherwise
+         * execution continues at place where swsusp_arch_suspend was called.
+         *
+         * We don't know whether it's safe to continue (this shouldn't happen),
+         * so let's err on the side of caution.
+         */
+        BUG();
+
+Failed:
+        free_pbe_list(&restore_pblist, 0);
+#ifdef CONFIG_HIGHMEM
+        free_pbe_list(&restore_highmem_pblist, 1);
+#endif
+        return 1;
+}
+
+/**
+ * toi_go_atomic - do the actual atomic copy/restore
+ * @state:           The state to use for dpm_suspend_start & power_down calls.
+ * @suspend_time:  Whether we're suspending or resuming.
+ **/
+int toi_go_atomic(pm_message_t state, int suspend_time)
+{
+        if (suspend_time) {
+                if (platform_begin(1)) {
+                        set_abort_result(TOI_PLATFORM_PREP_FAILED);
+                        toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
+                        return 1;
+                }
+
+                if (dpm_prepare(PMSG_FREEZE)) {
+                        set_abort_result(TOI_DPM_PREPARE_FAILED);
+                        dpm_complete(PMSG_RECOVER);
+                        toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
+                        return 1;
+                }
+        }
+
+        suspend_console();
+        pm_restrict_gfp_mask();
+
+        if (suspend_time) {
+                if (dpm_suspend(state)) {
+                        set_abort_result(TOI_DPM_SUSPEND_FAILED);
+                        toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
+                        return 1;
+                }
+        } else {
+                if (dpm_suspend_start(state)) {
+                        set_abort_result(TOI_DPM_SUSPEND_FAILED);
+                        toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
+                        return 1;
+                }
+        }
+
+        /* At this point, dpm_suspend_start() has been called, but *not*
+         * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
+         * Otherwise, drivers for some devices (e.g. interrupt controllers)
+         * become desynchronized with the actual state of the hardware
+         * at resume time, and evil weirdness ensues.
+         */
+
+        if (dpm_suspend_end(state)) {
+                set_abort_result(TOI_DEVICE_REFUSED);
+                toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1);
+                return 1;
+        }
+
+        if (suspend_time) {
+                if (platform_pre_snapshot(1))
+                        set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
+        } else {
+                if (platform_pre_restore(1))
+                        set_abort_result(TOI_PRE_RESTORE_FAILED);
+        }
+
+        if (test_result_state(TOI_ABORTED)) {
+                toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1);
+                return 1;
+        }
+
+        if (disable_nonboot_cpus()) {
+                set_abort_result(TOI_CPU_HOTPLUG_FAILED);
+                toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, suspend_time, 1);
+                return 1;
+        }
+
+        local_irq_disable();
+
+        if (syscore_suspend()) {
+                set_abort_result(TOI_SYSCORE_REFUSED);
+                toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1);
+                return 1;
+        }
+
+        if (suspend_time && pm_wakeup_pending()) {
+                set_abort_result(TOI_WAKEUP_EVENT);
+                toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1);
+                return 1;
+        }
+        return 0;
+}
+
+/**
+ * toi_end_atomic - post atomic copy/restore routines
+ * @stage:                What step to start at.
+ * @suspend_time:        Whether we're suspending or resuming.
+ * @error:                Whether we're recovering from an error.
+ **/
+void toi_end_atomic(int stage, int suspend_time, int error)
+{
+        pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) :
+                PMSG_RESTORE;
+
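+        /*
+         * Deliberate fall-through: entering at @stage runs that step and all
+         * later ones, so a partially completed toi_go_atomic() can be unwound
+         * from the point where it failed.
+         */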
+        switch (stage) {
+        case ATOMIC_ALL_STEPS:
+                if (!suspend_time) {
+                        events_check_enabled = false;
+                }
+                platform_leave(1);
+        case ATOMIC_STEP_SYSCORE_RESUME:
+                syscore_resume();
+        case ATOMIC_STEP_IRQS:
+                local_irq_enable();
+        case ATOMIC_STEP_CPU_HOTPLUG:
+                enable_nonboot_cpus();
+        case ATOMIC_STEP_PLATFORM_FINISH:
+                if (!suspend_time && error & 2)
+                        platform_restore_cleanup(1);
+                else 
+                        platform_finish(1);
+                dpm_resume_start(msg);
+        case ATOMIC_STEP_DEVICE_RESUME:
+                if (suspend_time && (error & 2))
+                        platform_recover(1);
+                dpm_resume(msg);
+                if (!toi_in_suspend()) {
+                    dpm_resume_end(PMSG_RECOVER);
+                }
+                if (error || !toi_in_suspend()) {
+                        pm_restore_gfp_mask();
+                }
+                resume_console();
+        case ATOMIC_STEP_DPM_COMPLETE:
+                dpm_complete(msg);
+        case ATOMIC_STEP_PLATFORM_END:
+                platform_end(1);
+
+                toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
+        }
+}
diff --git b/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
new file mode 100644
index 0000000..e2d2b4f
--- /dev/null
+++ b/kernel/power/tuxonice_atomic_copy.h
@@ -0,0 +1,25 @@
+/*
+ * kernel/power/tuxonice_atomic_copy.h
+ *
+ * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * Routines for doing the atomic save/restore.
+ */
+
+enum {
+        ATOMIC_ALL_STEPS,
+        ATOMIC_STEP_SYSCORE_RESUME,
+        ATOMIC_STEP_IRQS,
+        ATOMIC_STEP_CPU_HOTPLUG,
+        ATOMIC_STEP_PLATFORM_FINISH,
+        ATOMIC_STEP_DEVICE_RESUME,
+        ATOMIC_STEP_DPM_COMPLETE,
+        ATOMIC_STEP_PLATFORM_END,
+};
+
+int toi_go_atomic(pm_message_t state, int toi_time);
+void toi_end_atomic(int stage, int toi_time, int error);
+
+extern void platform_recover(int platform_mode);
diff --git b/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h
new file mode 100644
index 0000000..2f717f5
--- /dev/null
+++ b/kernel/power/tuxonice_bio.h
@@ -0,0 +1,80 @@
+/*
+ * kernel/power/tuxonice_bio.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * tuxonice_bio.c, which contains low level io functions.
+ */
+
+#include <linux/buffer_head.h>
+#include "tuxonice_extent.h"
+
+void toi_put_extent_chain(struct hibernate_extent_chain *chain);
+int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
+                unsigned long start, unsigned long end);
+
+struct hibernate_extent_saved_state {
+        int extent_num;
+        struct hibernate_extent *extent_ptr;
+        unsigned long offset;
+};
+
+struct toi_bdev_info {
+        struct toi_bdev_info *next;
+        struct hibernate_extent_chain blocks;
+        struct block_device *bdev;
+        struct toi_module_ops *allocator;
+        int allocator_index;
+        struct hibernate_extent_chain allocations;
+        char name[266]; /* "swap on " or "file " + up to 256 chars */
+
+        /* Saved in header */
+        char uuid[17];
+        dev_t dev_t;
+        int prio;
+        int bmap_shift;
+        int blocks_per_page;
+        unsigned long pages_used;
+        struct hibernate_extent_saved_state saved_state[4];
+};
+
+struct toi_extent_iterate_state {
+        struct toi_bdev_info *current_chain;
+        int num_chains;
+        int saved_chain_number[4];
+        struct toi_bdev_info *saved_chain_ptr[4];
+};
+
+/*
+ * Our exported interface so the swapwriter and filewriter don't
+ * need these functions duplicated.
+ */
+struct toi_bio_ops {
+        int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
+                        struct page *page);
+        int (*register_storage)(struct toi_bdev_info *new);
+        void (*free_storage)(void);
+};
+
+struct toi_allocator_ops {
+        unsigned long (*toi_swap_storage_available) (void);
+};
+
+extern struct toi_bio_ops toi_bio_ops;
+
+extern char *toi_writer_buffer;
+extern int toi_writer_buffer_posn;
+
+struct toi_bio_allocator_ops {
+        int (*register_storage) (void);
+        unsigned long (*storage_available)(void);
+        int (*allocate_storage) (struct toi_bdev_info *, unsigned long);
+        int (*bmap) (struct toi_bdev_info *);
+        void (*free_storage) (struct toi_bdev_info *);
+        unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used);
+};
+
+extern int toi_bio_register_storage(void);
diff --git b/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c
new file mode 100644
index 0000000..11cd37f
--- /dev/null
+++ b/kernel/power/tuxonice_bio_chains.c
@@ -0,0 +1,1121 @@
+/*
+ * kernel/power/tuxonice_bio_chains.c
+ *
+ * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ */
+
+#include <linux/mm_types.h>
+#include "tuxonice_bio.h"
+#include "tuxonice_bio_internal.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+#include "tuxonice.h"
+#include "tuxonice_io.h"
+
+static struct toi_bdev_info *prio_chain_head;
+static int num_chains;
+
+/* Pointer to current entry being loaded/saved. */
+struct toi_extent_iterate_state toi_writer_posn;
+
+#define metadata_size (sizeof(struct toi_bdev_info) - \
+                offsetof(struct toi_bdev_info, uuid))
+
+/*
+ * After section 0 (header) comes 2 => next_section[0] = 2
+ */
+static int next_section[3] = { 2, 3, 1 };
+
+/**
+ * dump_block_chains - print the contents of the bdev info array.
+ **/
+void dump_block_chains(void)
+{
+        int i = 0;
+        int j;
+        struct toi_bdev_info *cur_chain = prio_chain_head;
+
+        while (cur_chain) {
+                struct hibernate_extent *this = cur_chain->blocks.first;
+
+                printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio);
+
+                while (this) {
+                        printk(KERN_CONT " [%lu-%lu]%s", this->start,
+                                        this->end, this->next ? "," : "");
+                        this = this->next;
+                }
+
+                printk("\n");
+                cur_chain = cur_chain->next;
+                i++;
+        }
+
+        printk(KERN_DEBUG "Saved states:\n");
+        for (i = 0; i < 4; i++) {
+                printk(KERN_DEBUG "Slot %d: Chain %d.\n",
+                        i, toi_writer_posn.saved_chain_number[i]);
+
+                cur_chain = prio_chain_head;
+                j = 0;
+                while (cur_chain) {
+                        printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n",
+                                        j, cur_chain->saved_state[i].extent_num,
+                                        cur_chain->saved_state[i].offset);
+                        cur_chain = cur_chain->next;
+                        j++;
+                }
+                printk(KERN_CONT "\n");
+        }
+}
+
+/**
+ * toi_extent_chain_next - advance the current chain's position by one block
+ *
+ * Move the current chain's offset on by one, stepping to the next extent
+ * (or marking the chain exhausted) when the end of the current extent is
+ * reached.
+ **/
+static void toi_extent_chain_next(void)
+{
+        struct toi_bdev_info *this = toi_writer_posn.current_chain;
+
+        if (!this->blocks.current_extent)
+                return;
+
+        if (this->blocks.current_offset == this->blocks.current_extent->end) {
+                if (this->blocks.current_extent->next) {
+                        this->blocks.current_extent =
+                                this->blocks.current_extent->next;
+                        this->blocks.current_offset =
+                                this->blocks.current_extent->start;
+                } else {
+                        this->blocks.current_extent = NULL;
+                        this->blocks.current_offset = 0;
+                }
+        } else
+                this->blocks.current_offset++;
+}
+
+/**
+ * __find_next_chain_same_prio - find another usable chain of this priority
+ *
+ * Walk the list (wrapping at the end) looking for the next chain of the
+ * same priority that still has blocks left. If we get back to the chain
+ * we started on, use it again.
+ **/
+static struct toi_bdev_info *__find_next_chain_same_prio(void)
+{
+        struct toi_bdev_info *start_chain = toi_writer_posn.current_chain;
+        struct toi_bdev_info *this = start_chain;
+        int orig_prio = this->prio;
+
+        do {
+                this = this->next;
+
+                if (!this)
+                        this = prio_chain_head;
+
+                /* Back on original chain? Use it again. */
+                if (this == start_chain)
+                        return start_chain;
+
+        } while (!this->blocks.current_extent || this->prio != orig_prio);
+
+        return this;
+}
+
+static void find_next_chain(void)
+{
+        struct toi_bdev_info *this;
+
+        this = __find_next_chain_same_prio();
+
+        /*
+         * If we didn't get another chain of the same priority that we
+         * can use, look for the next priority.
+         */
+        while (this && !this->blocks.current_extent)
+                this = this->next;
+
+        toi_writer_posn.current_chain = this;
+}
+
+/**
+ * toi_extent_state_next - go to the next extent
+ * @blocks: The number of blocks to progress by.
+ * @current_stream: The stream being read/written; the header (stream 0)
+ *        is not striped across chains.
+ *
+ * Given a state, progress to the next valid entry. We may begin in an
+ * invalid state, as we do when invoked after extent_state_goto_start below.
+ *
+ * When using compression and expected_compression > 0, we let the image size
+ * be larger than storage, so we can validly run out of data to return.
+ **/
+static unsigned long toi_extent_state_next(int blocks, int current_stream)
+{
+        int i;
+
+        if (!toi_writer_posn.current_chain)
+                return -ENOSPC;
+
+        /* Assume chains always have lengths that are multiples of @blocks */
+        for (i = 0; i < blocks; i++)
+                toi_extent_chain_next();
+
+        /* The header stream is not striped */
+        if (current_stream ||
+            !toi_writer_posn.current_chain->blocks.current_extent)
+                find_next_chain();
+
+        return  toi_writer_posn.current_chain ? 0 : -ENOSPC;
+}
+
+static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this)
+{
+        struct toi_bdev_info **prev_ptr;
+        struct toi_bdev_info *cur;
+
+        /* Loop through the existing chain, finding where to insert it */
+        prev_ptr = &prio_chain_head;
+        cur = prio_chain_head;
+
+        while (cur && cur->prio >= this->prio) {
+                prev_ptr = &cur->next;
+                cur = cur->next;
+        }
+
+        this->next = *prev_ptr;
+        *prev_ptr = this;
+
+        num_chains++;
+}
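+
+/*
+ * The list is kept in descending priority order, with a newly registered
+ * chain going after any existing chains of equal priority. Registering
+ * chains with priorities 1, 3 and then 2, for instance, leaves the list
+ * ordered 3, 2, 1; a second priority-3 chain would sit after the first.
+ */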
+
+/**
+ * toi_extent_state_goto_start - reinitialise the extent chain iterator
+ *
+ * Reset every chain to its first extent and make the head of the priority
+ * list the current chain.
+ **/
+void toi_extent_state_goto_start(void)
+{
+        struct toi_bdev_info *this = prio_chain_head;
+
+        while (this) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "Setting current extent to %p.", this->blocks.first);
+                this->blocks.current_extent = this->blocks.first;
+                if (this->blocks.current_extent) {
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        "Setting current offset to %lu.",
+                                        this->blocks.current_extent->start);
+                        this->blocks.current_offset =
+                                this->blocks.current_extent->start;
+                }
+
+                this = this->next;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.",
+                        prio_chain_head);
+        toi_writer_posn.current_chain = prio_chain_head;
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start.");
+}
+
+/**
+ * toi_extent_state_save - save the state of the iterator
+ * @slot:        Saved-state slot (0-3) to populate
+ *
+ * Record the current position of every chain in the given slot, in a
+ * format that can be used with relocated chains (at resume time).
+ **/
+void toi_extent_state_save(int slot)
+{
+        struct toi_bdev_info *cur_chain = prio_chain_head;
+        struct hibernate_extent *extent;
+        struct hibernate_extent_saved_state *chain_state;
+        int i = 0;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.",
+                        slot);
+
+        if (!toi_writer_posn.current_chain) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => "
+                                "chain_num = -1.");
+                toi_writer_posn.saved_chain_number[slot] = -1;
+                return;
+        }
+
+        while (cur_chain) {
+                i++;
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) "
+                                "state, slot %d.", i, cur_chain, slot);
+
+                chain_state = &cur_chain->saved_state[slot];
+
+                chain_state->offset = cur_chain->blocks.current_offset;
+
+                if (toi_writer_posn.current_chain == cur_chain) {
+                        toi_writer_posn.saved_chain_number[slot] = i;
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain "
+                                        "we were on => chain_num is %d.", i);
+                }
+
+                if (!cur_chain->blocks.current_extent) {
+                        chain_state->extent_num = 0;
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent "
+                                        "for this chain => extent_num %d is 0.",
+                                        i);
+                        cur_chain = cur_chain->next;
+                        continue;
+                }
+
+                extent = cur_chain->blocks.first;
+                chain_state->extent_num = 1;
+
+                while (extent != cur_chain->blocks.current_extent) {
+                        chain_state->extent_num++;
+                        extent = extent->next;
+                }
+
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i,
+                                chain_state->extent_num);
+
+                cur_chain = cur_chain->next;
+        }
+        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "Completed saving extent state slot %d.", slot);
+}
+
+/**
+ * toi_extent_state_restore - restore a position saved by toi_extent_state_save
+ * @slot:        Saved-state slot (0-3) to restore from
+ **/
+void toi_extent_state_restore(int slot)
+{
+        int i = 0;
+        struct toi_bdev_info *cur_chain = prio_chain_head;
+        struct hibernate_extent_saved_state *chain_state;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "toi_extent_state_restore - slot %d.", slot);
+
+        if (toi_writer_posn.saved_chain_number[slot] == -1) {
+                toi_writer_posn.current_chain = NULL;
+                return;
+        }
+
+        while (cur_chain) {
+                int posn;
+                int j;
+                i++;
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) "
+                                "state, slot %d.", i, cur_chain, slot);
+
+                chain_state = &cur_chain->saved_state[slot];
+
+                posn = chain_state->extent_num;
+
+                cur_chain->blocks.current_extent = cur_chain->blocks.first;
+                cur_chain->blocks.current_offset = chain_state->offset;
+
+                if (i == toi_writer_posn.saved_chain_number[slot]) {
+                        toi_writer_posn.current_chain = cur_chain;
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        "Found current chain.");
+                }
+
+                for (j = 0; j < 4; j++)
+                        if (i == toi_writer_posn.saved_chain_number[j]) {
+                                toi_writer_posn.saved_chain_ptr[j] = cur_chain;
+                                toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        "Found saved chain ptr %d (%p) (offset"
+                                        " %d).", j, cur_chain,
+                                        cur_chain->saved_state[j].offset);
+                        }
+
+                if (posn) {
+                        while (--posn)
+                                cur_chain->blocks.current_extent =
+                                        cur_chain->blocks.current_extent->next;
+                } else
+                        cur_chain->blocks.current_extent = NULL;
+
+                cur_chain = cur_chain->next;
+        }
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done.");
+        if (test_action_state(TOI_LOGALL))
+                dump_block_chains();
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the image header required
+ * for the chain data. This ignores the links between
+ * pages, which we factor in when allocating the space.
+ */
+int toi_bio_devinfo_storage_needed(void)
+{
+        int result = sizeof(num_chains);
+        struct toi_bdev_info *chain = prio_chain_head;
+
+        while (chain) {
+                result += metadata_size;
+
+                /* Chain size */
+                result += sizeof(int);
+
+                /* Extents */
+                result += (2 * sizeof(unsigned long) *
+                        chain->blocks.num_extents);
+
+                chain = chain->next;
+        }
+
+        result += 4 * sizeof(int);
+        return result;
+}
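+
+/*
+ * Rough worked example of the calculation above (assuming a typical
+ * 64-bit build, so sizeof(int) == 4 and sizeof(unsigned long) == 8):
+ * a single chain with three extents needs 4 (number of chains) +
+ * metadata_size + 4 (extent count) + 3 * 16 (extent start/end pairs) +
+ * 16 (saved chain numbers) bytes of header space, before the per-page
+ * links that are factored in when the space is actually allocated.
+ */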
+
+static unsigned long chain_pages_used(struct toi_bdev_info *chain)
+{
+        struct hibernate_extent *this = chain->blocks.first;
+        struct hibernate_extent_saved_state *state = &chain->saved_state[3];
+        unsigned long size = 0;
+        int extent_idx = 1;
+
+        if (!state->extent_num) {
+                if (!this)
+                        return 0;
+                else
+                        return chain->blocks.size;
+        }
+
+        while (extent_idx < state->extent_num) {
+                size += (this->end - this->start + 1);
+                this = this->next;
+                extent_idx++;
+        }
+
+        /* We didn't use the one we're sitting on, so don't count it */
+        return size + state->offset - this->start;
+}
+
+void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain)
+{
+        unsigned long used = chain_pages_used(chain);
+        unsigned long first_freed = 0;
+
+        /* Free the storage */
+        if (chain->allocator->bio_allocator_ops->free_unused_storage)
+                first_freed = chain->allocator->bio_allocator_ops->
+                        free_unused_storage(chain, used);
+
+        printk(KERN_EMERG "Used %lu blocks in this chain. First extent freed "
+                        "is %lx.\n", used, first_freed);
+
+        /* Adjust / free the extents. */
+        toi_put_extent_chain_from(&chain->blocks, first_freed);
+
+        {
+                struct hibernate_extent *this = chain->blocks.first;
+
+                while (this) {
+                        printk(KERN_DEBUG "Extent %lx-%lx.\n", this->start,
+                                        this->end);
+                        this = this->next;
+                }
+        }
+}
+
+/**
+ * toi_serialise_extent_chain - write a chain in the image
+ * @chain:        Chain to write.
+ **/
+static int toi_serialise_extent_chain(struct toi_bdev_info *chain)
+{
+        struct hibernate_extent *this;
+        int ret;
+        int i = 1;
+
+        chain->pages_used = chain_pages_used(chain);
+
+        if (test_action_state(TOI_LOGALL))
+                dump_block_chains();
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).",
+                        chain->dev_t);
+        /* Device info -  dev_t, prio, bmap_shift, blocks per page, positions */
+        ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
+                        (char *) &chain->uuid, metadata_size);
+        if (ret)
+                return ret;
+
+        /* Num extents */
+        ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
+                        (char *) &chain->blocks.num_extents, sizeof(int));
+        if (ret)
+                return ret;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
+                        chain->blocks.num_extents);
+
+        this = chain->blocks.first;
+        while (this) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i);
+                ret = toiActiveAllocator->rw_header_chunk(WRITE,
+                                &toi_blockwriter_ops,
+                                (char *) this, 2 * sizeof(this->start));
+                if (ret)
+                        return ret;
+                this = this->next;
+                i++;
+        }
+
+        return ret;
+}
+
+int toi_serialise_extent_chains(void)
+{
+        struct toi_bdev_info *this = prio_chain_head;
+        int result;
+
+        /* Write the number of chains */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)",
+                        num_chains);
+        result = toiActiveAllocator->rw_header_chunk(WRITE,
+                        &toi_blockwriter_ops, (char *) &num_chains,
+                        sizeof(int));
+        if (result)
+                return result;
+
+        /* Then the chains themselves */
+        while (this) {
+                result = toi_serialise_extent_chain(this);
+                if (result)
+                        return result;
+                this = this->next;
+        }
+
+        /*
+         * Finally, the chain we should be on at the start of each
+         * section.
+         */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers.");
+        result = toiActiveAllocator->rw_header_chunk(WRITE,
+                        &toi_blockwriter_ops,
+                        (char *) &toi_writer_posn.saved_chain_number[0],
+                        4 * sizeof(int));
+
+        return result;
+}
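+
+/*
+ * The on-disk layout of this part of the image header is therefore:
+ *
+ *        int             number of chains
+ *        per chain:
+ *                metadata        uuid, dev_t, prio, bmap_shift,
+ *                                blocks_per_page, pages_used, saved_state
+ *                int             number of extents
+ *                per extent:     unsigned long start, unsigned long end
+ *        int[4]          saved chain number for each section slot
+ *
+ * toi_load_extent_chains() below reads everything back in the same order.
+ */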
+
+int toi_register_storage_chain(struct toi_bdev_info *new)
+{
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.",
+                        new);
+        toi_insert_chain_in_prio_list(new);
+        return 0;
+}
+
+static void free_bdev_info(struct toi_bdev_info *chain)
+{
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain);
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents.");
+        toi_put_extent_chain(&chain->blocks);
+
+        /*
+         * The allocator may need to do more than just free the chains
+         * (swap_free, for example). Don't call from boot kernel.
+         */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents.");
+        if (chain->allocator)
+                chain->allocator->bio_allocator_ops->free_storage(chain);
+
+        /*
+         * Dropping out of reading atomic copy? Need to undo
+         * toi_open_by_devnum.
+         */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev.");
+        if (chain->bdev && !IS_ERR(chain->bdev) &&
+                        chain->bdev != resume_block_device &&
+                        chain->bdev != header_block_device &&
+                        test_toi_state(TOI_TRYING_TO_RESUME))
+                toi_close_bdev(chain->bdev);
+
+        /* Poison */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct.");
+        toi_kfree(39, chain, sizeof(*chain));
+
+        if (prio_chain_head == chain)
+                prio_chain_head = NULL;
+
+        num_chains--;
+}
+
+void free_all_bdev_info(void)
+{
+        struct toi_bdev_info *this = prio_chain_head;
+
+        while (this) {
+                struct toi_bdev_info *next = this->next;
+                free_bdev_info(this);
+                this = next;
+        }
+
+        memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn));
+        prio_chain_head = NULL;
+}
+
+static void set_up_start_position(void)
+{
+        toi_writer_posn.current_chain = prio_chain_head;
+        go_next_page(0, 0);
+}
+
+/**
+ * toi_load_extent_chain - read back a chain saved in the image
+ * @index:        Index of the chain being loaded
+ * @num_loaded:        Running count of extents loaded so far
+ *
+ * The linked list of extents is reconstructed from the disk and the chain
+ * is inserted into the priority list as soon as its first extent is read.
+ **/
+int toi_load_extent_chain(int index, int *num_loaded)
+{
+        struct toi_bdev_info *chain = toi_kzalloc(39,
+                        sizeof(struct toi_bdev_info), GFP_ATOMIC);
+        struct hibernate_extent *this, *last = NULL;
+        int i, ret;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index);
+        /* Get dev_t, prio, bmap_shift, blocks per page, positions */
+        ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+                        (char *) &chain->uuid, metadata_size);
+
+        if (ret) {
+                printk(KERN_ERR "Failed to read extent chain metadata.\n");
+                toi_kfree(39, chain, sizeof(*chain));
+                return 1;
+        }
+
+        toi_bkd.pages_used[index] = chain->pages_used;
+
+        ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+                        (char *) &chain->blocks.num_extents, sizeof(int));
+        if (ret) {
+                printk(KERN_ERR "Failed to read the size of extent chain.\n");
+                toi_kfree(39, chain, sizeof(*chain));
+                return 1;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
+                        chain->blocks.num_extents);
+
+        for (i = 0; i < chain->blocks.num_extents; i++) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1);
+
+                this = toi_kzalloc(2, sizeof(struct hibernate_extent),
+                                TOI_ATOMIC_GFP);
+                if (!this) {
+                        printk(KERN_INFO "Failed to allocate a new extent.\n");
+                        free_bdev_info(chain);
+                        return -ENOMEM;
+                }
+                this->next = NULL;
+                /* Get the next page */
+                ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
+                                NULL, (char *) this, 2 * sizeof(this->start));
+                if (ret) {
+                        printk(KERN_INFO "Failed to read an extent.\n");
+                        toi_kfree(2, this, sizeof(struct hibernate_extent));
+                        free_bdev_info(chain);
+                        return 1;
+                }
+
+                if (last)
+                        last->next = this;
+                else {
+                        char b1[32], b2[32], b3[32];
+                        /*
+                         * Open the bdev
+                         */
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                "Chain dev_t is %s. Resume dev_t is %s. Header"
+                                " dev_t is %s.\n",
+                                format_dev_t(b1, chain->dev_t),
+                                format_dev_t(b2, resume_dev_t),
+                                format_dev_t(b3, toi_sig_data->header_dev_t));
+
+                        if (chain->dev_t == resume_dev_t)
+                                chain->bdev = resume_block_device;
+                        else if (chain->dev_t == toi_sig_data->header_dev_t)
+                                chain->bdev = header_block_device;
+                        else {
+                                chain->bdev = toi_open_bdev(chain->uuid,
+                                                chain->dev_t, 1);
+                                if (IS_ERR(chain->bdev)) {
+                                        free_bdev_info(chain);
+                                        return -ENODEV;
+                                }
+                        }
+
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift "
+                                        "is %d and blocks per page is %d.",
+                                        chain->bmap_shift,
+                                        chain->blocks_per_page);
+
+                        chain->blocks.first = this;
+
+                        /*
+                         * Couldn't do this earlier, but can't do
+                         * goto_start now - we may have already used blocks
+                         * in the first chain.
+                         */
+                        chain->blocks.current_extent = this;
+                        chain->blocks.current_offset = this->start;
+
+                        /*
+                         * Can't wait until we've read the whole chain
+                         * before we insert it in the list. We might need
+                         * this chain to read the next page in the header
+                         */
+                        toi_insert_chain_in_prio_list(chain);
+                }
+
+                /*
+                 * We have to wait until 2 extents are loaded before setting up
+                 * properly because if the first extent has only one page, we
+                 * will need to put the position on the second extent. Sounds
+                 * obvious, but it wasn't!
+                 */
+                (*num_loaded)++;
+                if ((*num_loaded) == 2)
+                        set_up_start_position();
+                last = this;
+        }
+
+        /*
+         * Shouldn't get empty chains, but it's not impossible. Link them in so
+         * they get freed properly later.
+         */
+        if (!chain->blocks.num_extents)
+                toi_insert_chain_in_prio_list(chain);
+
+        if (!chain->blocks.current_extent) {
+                chain->blocks.current_extent = chain->blocks.first;
+                if (chain->blocks.current_extent)
+                        chain->blocks.current_offset =
+                                chain->blocks.current_extent->start;
+        }
+        return 0;
+}
+
+int toi_load_extent_chains(void)
+{
+        int result;
+        int to_load;
+        int i;
+        int extents_loaded = 0;
+
+        result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+                        (char *) &to_load,
+                        sizeof(int));
+        if (result)
+                return result;
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load);
+
+        for (i = 0; i < to_load; i++) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.",
+                                i, to_load);
+                result = toi_load_extent_chain(i, &extents_loaded);
+                if (result)
+                        return result;
+        }
+
+        /* If we never got to a second extent, we still need to do this. */
+        if (extents_loaded == 1)
+                set_up_start_position();
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Reading saved chain numbers.");
+        result = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
+                        &toi_blockwriter_ops,
+                        (char *) &toi_writer_posn.saved_chain_number[0],
+                        4 * sizeof(int));
+
+        return result;
+}
+
+static int toi_end_of_stream(int writing, int section_barrier)
+{
+        struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
+        int compare_to = next_section[current_stream];
+        struct toi_bdev_info *compare_chain =
+                toi_writer_posn.saved_chain_ptr[compare_to];
+        int compare_offset = compare_chain ?
+                compare_chain->saved_state[compare_to].offset : 0;
+
+        if (!section_barrier)
+                return 0;
+
+        if (!cur_chain)
+                return 1;
+
+        if (cur_chain == compare_chain &&
+            cur_chain->blocks.current_offset == compare_offset) {
+                if (writing) {
+                        if (!current_stream) {
+                                debug_broken_header();
+                                return 1;
+                        }
+                } else {
+                        more_readahead = 0;
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        "Reached the end of stream %d "
+                                        "(not an error).", current_stream);
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * go_next_page - skip blocks to the start of the next page
+ * @writing: Whether we're reading or writing the image.
+ * @section_barrier: Whether to treat a section boundary as the end.
+ *
+ * Go forward one page.
+ **/
+int go_next_page(int writing, int section_barrier)
+{
+        struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
+        int max = cur_chain ? cur_chain->blocks_per_page : 1;
+
+        /*
+         * Go forward a page - or maybe two. Don't stripe the header, so
+         * that bad fragmentation doesn't push the extent data containing
+         * the location of the second page out of the first header page.
+         */
+        if (toi_extent_state_next(max, current_stream)) {
+                /* Don't complain if readahead falls off the end */
+                if (writing && section_barrier) {
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
+                                "Expected compression ratio too optimistic?");
+                        if (test_action_state(TOI_LOGALL))
+                                dump_block_chains();
+                }
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
+                                "read/write. (Not necessarily a fatal error.)");
+                return -ENOSPC;
+        }
+
+        return 0;
+}
+
+int devices_of_same_priority(struct toi_bdev_info *this)
+{
+        struct toi_bdev_info *check = prio_chain_head;
+        int i = 0;
+
+        while (check) {
+                if (check->prio == this->prio)
+                        i++;
+                check = check->next;
+        }
+
+        return i;
+}
+
+/**
+ * toi_bio_rw_page - do i/o on the next disk page in the image
+ * @writing: Whether reading or writing.
+ * @page: Page to do i/o on.
+ * @is_readahead: Whether we're doing readahead
+ * @free_group: The group used in allocating the page
+ *
+ * Submit a page for reading or writing, possibly readahead.
+ * Pass the group used in allocating the page as well, as it should
+ * be freed on completion of the bio if we're writing the page.
+ **/
+int toi_bio_rw_page(int writing, struct page *page,
+                int is_readahead, int free_group)
+{
+        int result = toi_end_of_stream(writing, 1);
+        struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
+
+        if (result) {
+                if (writing)
+                        abort_hibernate(TOI_INSUFFICIENT_STORAGE,
+                                "Insufficient storage for your image.");
+                else
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
+                                "read/write another page when stream has "
+                                "ended.");
+                return -ENOSPC;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "%s %lx:%ld",
+                        writing ? "Write" : "Read",
+                        dev_info->dev_t, dev_info->blocks.current_offset);
+
+        result = toi_do_io(writing, dev_info->bdev,
+                dev_info->blocks.current_offset << dev_info->bmap_shift,
+                page, is_readahead, 0, free_group);
+
+        /* Ignore the result here - we check end of stream if we come in again */
+        go_next_page(writing, 1);
+
+        if (result)
+                printk(KERN_ERR "toi_do_io returned %d.\n", result);
+        return result;
+}
+
+dev_t get_header_dev_t(void)
+{
+        return prio_chain_head->dev_t;
+}
+
+struct block_device *get_header_bdev(void)
+{
+        return prio_chain_head->bdev;
+}
+
+unsigned long get_headerblock(void)
+{
+        return prio_chain_head->blocks.first->start <<
+                prio_chain_head->bmap_shift;
+}
+
+int get_main_pool_phys_params(void)
+{
+        struct toi_bdev_info *this = prio_chain_head;
+        int result;
+
+        while (this) {
+                result = this->allocator->bio_allocator_ops->bmap(this);
+                if (result)
+                        return result;
+                this = this->next;
+        }
+
+        return 0;
+}
+
+static int apply_header_reservation(void)
+{
+        int i;
+
+        if (!header_pages_reserved) {
+                toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                "No header pages reserved at the moment.");
+                return 0;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation.");
+
+        /* Apply header space reservation */
+        toi_extent_state_goto_start();
+
+        for (i = 0; i < header_pages_reserved; i++)
+                if (go_next_page(1, 0))
+                        return -ENOSPC;
+
+        /* The end of header pages will be the start of pageset 2 */
+        toi_extent_state_save(2);
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "Finished applying header reservation.");
+        return 0;
+}
+
+int toi_bio_register_storage(void)
+{
+        int result = 0;
+        struct toi_module_ops *this_module;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
+                "Registering storage.");
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    this_module->type != BIO_ALLOCATOR_MODULE)
+                        continue;
+                toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                "Registering storage from %s.",
+                                this_module->name);
+                result = this_module->bio_allocator_ops->register_storage();
+                if (result)
+                        break;
+        }
+
+        return result;
+}
+
+void toi_bio_free_unused_storage(void)
+{
+        struct toi_bdev_info *this = prio_chain_head;
+
+        while (this) {
+                toi_bio_free_unused_storage_chain(this);
+                this = this->next;
+        }
+}
+
+int toi_bio_allocate_storage(unsigned long request)
+{
+        struct toi_bdev_info *chain = prio_chain_head;
+        unsigned long to_get = request;
+        unsigned long extra_pages, needed;
+        int no_free = 0;
+
+        if (!chain) {
+                printk(KERN_WARNING "TuxOnIce: No storage was registered.\n");
+                return 0;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
+                        "Request is %lu pages.", request);
+        extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
+                               + sizeof(int)), PAGE_SIZE);
+        needed = request + extra_pages + header_pages_reserved;
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu "
+                        "for header => %lu.",
+                        extra_pages, header_pages_reserved, needed);
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.",
+                        raw_pages_allocd);
+
+        to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0;
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get);
+
+        if (!to_get)
+                return apply_header_reservation();
+
+        while (to_get && chain) {
+                int num_group = devices_of_same_priority(chain);
+                int divisor = num_group - no_free;
+                int i;
+                unsigned long portion = DIV_ROUND_UP(to_get, divisor);
+                unsigned long got = 0;
+                unsigned long got_this_round = 0;
+                struct toi_bdev_info *top = chain;
+
+                toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                " Start of loop. To get is %lu. Divisor is %d.",
+                                to_get, divisor);
+                no_free = 0;
+
+                /*
+                 * We're aiming to spread the allocated storage as evenly
+                 * as possible, but we also want to get all the storage we
+                 * can off this priority.
+                 */
+                for (i = 0; i < num_group; i++) {
+                        struct toi_bio_allocator_ops *ops =
+                                chain->allocator->bio_allocator_ops;
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        " Asking for %lu pages from chain %p.",
+                                        portion, chain);
+                        got = ops->allocate_storage(chain, portion);
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        " Got %lu pages from allocator %p.",
+                                        got, chain);
+                        if (!got)
+                                no_free++;
+                        got_this_round += got;
+                        chain = chain->next;
+                }
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a "
+                                "total of %lu pages from %d allocators.",
+                                got_this_round, divisor - no_free);
+
+                raw_pages_allocd += got_this_round;
+                to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd :
+                        0;
+
+                /*
+                 * If we got anything from chains of this priority and we
+                 * still have storage to allocate, go over this priority
+                 * again.
+                 */
+                if (got_this_round && to_get)
+                        chain = top;
+                else
+                        no_free = 0;
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling "
+                        "get_main_pool_phys_params");
+        /* Now let swap allocator bmap the pages */
+        get_main_pool_phys_params();
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header.");
+        return apply_header_reservation();
+}
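+
+/*
+ * To put the extra_pages calculation in toi_bio_allocate_storage() in
+ * concrete terms: with 4KB pages on a typical 64-bit build, a request for
+ * 100,000 pages adds DIV_ROUND_UP(100000 * (8 + 4), 4096) = 293 pages for
+ * the extent metadata, plus the header reservation, before working out how
+ * much new storage must be asked of the allocators.
+ */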
+
+void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd)
+{
+        int i = 0;
+        struct toi_bdev_info *cur_chain = prio_chain_head;
+
+        while (cur_chain) {
+                cur_chain->pages_used = bkd->pages_used[i];
+                cur_chain = cur_chain->next;
+                i++;
+        }
+}
+
+int toi_bio_chains_debug_info(char *buffer, int size)
+{
+        /* Show what we actually used */
+        struct toi_bdev_info *cur_chain = prio_chain_head;
+        int len = 0;
+
+        while (cur_chain) {
+                len += scnprintf(buffer + len, size - len, "  Used %lu pages "
+                                "from %s.\n", cur_chain->pages_used,
+                                cur_chain->name);
+                cur_chain = cur_chain->next;
+        }
+
+        return len;
+}
+
+void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
+{
+        struct toi_bdev_info *this = toi_writer_posn.current_chain,
+                             *cmp = prio_chain_head;
+
+        ptr->save.chain = 1;
+        while (this != cmp) {
+                ptr->save.chain++;
+                cmp = cmp->next;
+        }
+        ptr->save.block = this->blocks.current_offset;
+
+        /* Save the raw info internally for quicker access when updating
+         * pointers */
+        ptr->bdev = this->bdev;
+        ptr->block = this->blocks.current_offset << this->bmap_shift;
+}
+
+void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
+{
+        int i = ptr->save.chain - 1;
+        struct toi_bdev_info *this;
+        struct hibernate_extent *hib;
+
+        /* Find chain by stored index */
+        this = prio_chain_head;
+        while (i) {
+                this = this->next;
+                i--;
+        }
+        toi_writer_posn.current_chain = this;
+
+        /* Restore block */
+        this->blocks.current_offset = ptr->save.block;
+
+        /* Find current offset from block number */
+        hib = this->blocks.first;
+
+        while (hib->start > ptr->save.block)
+                hib = hib->next;
+
+        this->blocks.last_touched = this->blocks.current_extent = hib;
+}
diff --git b/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c
new file mode 100644
index 0000000..bc27662
--- /dev/null
+++ b/kernel/power/tuxonice_bio_core.c
@@ -0,0 +1,1937 @@
+/*
+ * kernel/power/tuxonice_bio_core.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains block io functions for TuxOnIce. These are
+ * used by the swapwriter and it is planned that they will also
+ * be used by the NFSwriter.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/syscalls.h>
+#include <linux/suspend.h>
+#include <linux/ctype.h>
+#include <linux/mount.h>
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_bio_internal.h"
+
+#define MEMORY_ONLY 1
+#define THROTTLE_WAIT 2
+
+/* #define MEASURE_MUTEX_CONTENTION */
+#ifndef MEASURE_MUTEX_CONTENTION
+#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
+#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
+#else
+unsigned long mutex_times[2][2][NR_CPUS];
+#define my_mutex_lock(index, the_lock) do { \
+        int have_mutex; \
+        have_mutex = mutex_trylock(the_lock); \
+        if (!have_mutex) { \
+                mutex_lock(the_lock); \
+                mutex_times[index][0][smp_processor_id()]++; \
+        } else { \
+                mutex_times[index][1][smp_processor_id()]++; \
+        }
+
+#define my_mutex_unlock(index, the_lock) \
+        mutex_unlock(the_lock); \
+} while (0)
+#endif
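+
+/*
+ * Note that in the measuring variants above, my_mutex_lock() deliberately
+ * leaves its "do {" unclosed; the matching "} while (0)" is supplied by
+ * my_mutex_unlock(), so the two macros must always be used as a pair
+ * within the same function.
+ */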
+
+static int page_idx, reset_idx;
+
+static int target_outstanding_io = 1024;
+static int max_outstanding_writes, max_outstanding_reads;
+
+static struct page *bio_queue_head, *bio_queue_tail;
+static atomic_t toi_bio_queue_size;
+static DEFINE_SPINLOCK(bio_queue_lock);
+
+static int free_mem_throttle, throughput_throttle;
+int more_readahead = 1;
+static struct page *readahead_list_head, *readahead_list_tail;
+
+static struct page *waiting_on;
+
+static atomic_t toi_io_in_progress, toi_io_done;
+static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
+
+int current_stream;
+/* Not static, so that the allocators can set up and complete
+ * writing the header */
+char *toi_writer_buffer;
+int toi_writer_buffer_posn;
+
+static DEFINE_MUTEX(toi_bio_mutex);
+static DEFINE_MUTEX(toi_bio_readahead_mutex);
+
+static struct task_struct *toi_queue_flusher;
+static int toi_bio_queue_flush_pages(int dedicated_thread);
+
+struct toi_module_ops toi_blockwriter_ops;
+
+struct toi_incremental_image_pointer toi_inc_ptr[2][2];
+
+#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
+               atomic_read(&toi_bio_queue_size))
+
+unsigned long raw_pages_allocd, header_pages_reserved;
+
+static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+                int no_readahead);
+
+/**
+ * set_free_mem_throttle - set the point where we pause to avoid oom.
+ *
+ * Initially, this value is zero, but when we first fail to allocate memory,
+ * we set it (plus a buffer) and thereafter throttle i/o once that limit is
+ * reached.
+ **/
+static void set_free_mem_throttle(void)
+{
+        int new_throttle = nr_free_buffer_pages() + 256;
+
+        if (new_throttle > free_mem_throttle)
+                free_mem_throttle = new_throttle;
+}
+
+#define NUM_REASONS 7
+static atomic_t reasons[NUM_REASONS];
+static char *reason_name[NUM_REASONS] = {
+        "readahead not ready",
+        "bio allocation",
+        "synchronous I/O",
+        "toi_bio_get_new_page",
+        "memory low",
+        "readahead buffer allocation",
+        "throughput_throttle",
+};
+
+/* User Specified Parameters. */
+unsigned long resume_firstblock;
+dev_t resume_dev_t;
+struct block_device *resume_block_device;
+static atomic_t resume_bdev_open_count;
+
+struct block_device *header_block_device;
+
+/**
+ * toi_open_bdev: Open a bdev at resume time.
+ *
+ * uuid: The uuid of the filesystem or swap device, if known (may be NULL).
+ * default_device: The dev_t to fall back on when the uuid can't be resolved.
+ * The user can have resume= pointing at a swap partition/file that isn't
+ * swapon'd when they hibernate, and the header may be on a swap partition
+ * that was enabled when we hibernated but whose real index we don't know
+ * until we read its first page.
+ * display_errs: Whether to report errors (zero means do this quietly).
+ *
+ * We stored a dev_t in the image header. Open the matching device without
+ * requiring /dev/<whatever> in most cases and record the details needed
+ * to close it later and avoid duplicating work.
+ */
+struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
+                int display_errs)
+{
+        struct block_device *bdev;
+        dev_t device = default_device;
+        char buf[32];
+        int retried = 0;
+
+retry:
+        if (uuid) {
+                struct fs_info seek;
+                strncpy((char *) &seek.uuid, uuid, 16);
+                seek.dev_t = 0;
+                seek.last_mount_size = 0;
+                device = blk_lookup_fs_info(&seek);
+                if (!device) {
+                        device = default_device;
+                        printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
+                                        " to dev_t.\n");
+                } else
+                        printk(KERN_DEBUG "Resolved uuid to device %s.\n",
+                                        format_dev_t(buf, device));
+        }
+
+        if (!device) {
+                printk(KERN_ERR "TuxOnIce attempting to open a "
+                                "blank dev_t!\n");
+                dump_stack();
+                return NULL;
+        }
+        bdev = toi_open_by_devnum(device);
+
+        if (IS_ERR(bdev) || !bdev) {
+                if (!retried) {
+                        retried = 1;
+                        wait_for_device_probe();
+                        goto retry;
+                }
+                if (display_errs)
+                        toi_early_boot_message(1, TOI_CONTINUE_REQ,
+                                "Failed to get access to block device "
+                                "\"%x\" (error %d).\n Maybe you need "
+                                "to run mknod and/or lvmsetup in an "
+                                "initrd/ramfs?", device,
+                                (int) PTR_ERR(bdev));
+                return ERR_PTR(-EINVAL);
+        }
+        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                        "TuxOnIce got bdev %p for dev_t %x.",
+                        bdev, device);
+
+        return bdev;
+}
+
+static void toi_bio_reserve_header_space(unsigned long request)
+{
+        header_pages_reserved = request;
+}
+
+/**
+ * do_bio_wait - wait for some TuxOnIce I/O to complete
+ * @reason: The array index of the reason we're waiting.
+ *
+ * Wait for a particular page of I/O if we're after a particular page.
+ * If we're not after a particular page, wait instead for all in flight
+ * I/O to be completed or for us to have enough free memory to be able
+ * to submit more I/O.
+ *
+ * If we wait, we also update our statistics regarding why we waited.
+ **/
+static void do_bio_wait(int reason)
+{
+        struct page *was_waiting_on = waiting_on;
+
+        /* On SMP, waiting_on can be reset, so we make a copy */
+        if (was_waiting_on) {
+                wait_on_page_locked(was_waiting_on);
+                atomic_inc(&reasons[reason]);
+        } else {
+                atomic_inc(&reasons[reason]);
+
+                wait_event(num_in_progress_wait,
+                        !atomic_read(&toi_io_in_progress) ||
+                        nr_free_buffer_pages() > free_mem_throttle);
+        }
+}
+
+/**
+ * throttle_if_needed - wait for I/O completion if throttle points are reached
+ * @flags: What to check and how to act.
+ *
+ * Check whether we need to wait for some I/O to complete. We always check
+ * whether we have enough memory available, but may also (depending upon
+ * @reason) check if the throughput throttle limit has been reached.
+ **/
+static int throttle_if_needed(int flags)
+{
+        int free_pages = nr_free_buffer_pages();
+
+        /* Getting low on memory and I/O is in progress? */
+        while (unlikely(free_pages < free_mem_throttle) &&
+                        atomic_read(&toi_io_in_progress) &&
+                        !test_result_state(TOI_ABORTED)) {
+                if (!(flags & THROTTLE_WAIT))
+                        return -ENOMEM;
+                do_bio_wait(4);
+                free_pages = nr_free_buffer_pages();
+        }
+
+        while (!(flags & MEMORY_ONLY) && throughput_throttle &&
+                TOTAL_OUTSTANDING_IO >= throughput_throttle &&
+                !test_result_state(TOI_ABORTED)) {
+                int result = toi_bio_queue_flush_pages(0);
+                if (result)
+                        return result;
+                atomic_inc(&reasons[6]);
+                wait_event(num_in_progress_wait,
+                        !atomic_read(&toi_io_in_progress) ||
+                        TOTAL_OUTSTANDING_IO < throughput_throttle);
+        }
+
+        return 0;
+}
+
+/**
+ * update_throughput_throttle - update the raw throughput throttle
+ * @jif_index: The number of times this function has been called.
+ *
+ * This function is called four times per second by the core, and used to limit
+ * the amount of I/O we submit at once, spreading out our waiting through the
+ * whole job and letting userui get an opportunity to do its work.
+ *
+ * We don't start limiting I/O until 1/4s has gone so that we get a
+ * decent sample for our initial limit, and keep updating it because
+ * throughput may vary (on rotating media, eg) with our block number.
+ *
+ * We throttle to 1/10s worth of I/O.
+ **/
+static void update_throughput_throttle(int jif_index)
+{
+        int done = atomic_read(&toi_io_done);
+        throughput_throttle = done * 2 / 5 / jif_index;
+}
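+
+/*
+ * For example, two seconds in (jif_index == 8) with 16000 pages of I/O
+ * completed so far, the limit becomes 16000 * 2 / 5 / 8 = 800 outstanding
+ * pages - the 8000 pages/second measured so far, scaled down to a tenth
+ * of a second's worth.
+ */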
+
+/**
+ * toi_finish_all_io - wait for all outstanding i/o to complete
+ *
+ * Flush any queued but unsubmitted I/O and wait for it all to complete.
+ **/
+static int toi_finish_all_io(void)
+{
+        int result = toi_bio_queue_flush_pages(0);
+        toi_bio_queue_flusher_should_finish = 1;
+        wake_up(&toi_io_queue_flusher);
+        wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
+        return result;
+}
+
+/**
+ * toi_end_bio - bio completion function.
+ * @bio: bio that has completed.
+ *
+ * Function called by the block driver from interrupt context when I/O is
+ * completed. If we were writing the page, we want to free it and will have
+ * set bio->bi_private to the parameter we should use in telling the page
+ * allocation accounting code what the page was allocated for. If we're
+ * reading the page, it will be in the singly linked list made from
+ * page->private pointers.
+ **/
+static void toi_end_bio(struct bio *bio)
+{
+        struct page *page = bio->bi_io_vec[0].bv_page;
+
+        BUG_ON(bio->bi_error);
+
+        unlock_page(page);
+        bio_put(bio);
+
+        if (waiting_on == page)
+                waiting_on = NULL;
+
+        put_page(page);
+
+        if (bio->bi_private)
+                toi__free_page((int) ((unsigned long) bio->bi_private) , page);
+
+        bio_put(bio);
+
+        atomic_dec(&toi_io_in_progress);
+        atomic_inc(&toi_io_done);
+
+        wake_up(&num_in_progress_wait);
+}
+
+/**
+ * submit - submit BIO request
+ * @writing: READ or WRITE.
+ * @dev: The block device we're using.
+ * @first_block: The first sector we're using.
+ * @page: The page being used for I/O.
+ * @free_group: If writing, the group that was used in allocating the page
+ *         and which will be used in freeing the page from the completion
+ *         routine.
+ *
+ * Based on Patrick Mochel's pmdisk code from long ago: "Straight from the
+ * textbook - allocate and initialize the bio. If we're writing, make sure
+ * the page is marked as dirty. Then submit it and carry on."
+ *
+ * If we're just testing the speed of our own code, we fake having done all
+ * the hard work and call toi_end_bio immediately.
+ **/
+static int submit(int writing, struct block_device *dev, sector_t first_block,
+                struct page *page, int free_group)
+{
+        struct bio *bio = NULL;
+        int cur_outstanding_io, result;
+
+        /*
+         * Shouldn't throttle if reading - can deadlock in the single
+         * threaded case as pages are only freed when we use the
+         * readahead.
+         */
+        if (writing) {
+                result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
+                if (result)
+                        return result;
+        }
+
+        while (!bio) {
+                bio = bio_alloc(TOI_ATOMIC_GFP, 1);
+                if (!bio) {
+                        set_free_mem_throttle();
+                        do_bio_wait(1);
+                }
+        }
+
+        bio->bi_bdev = dev;
+        bio->bi_iter.bi_sector = first_block;
+        bio->bi_private = (void *) ((unsigned long) free_group);
+        bio->bi_end_io = toi_end_bio;
+        bio_set_flag(bio, BIO_TOI);
+
+        if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+                printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
+                                (unsigned long long) first_block);
+                bio_put(bio);
+                return -EFAULT;
+        }
+
+        bio_get(bio);
+
+        cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
+        if (writing) {
+                if (cur_outstanding_io > max_outstanding_writes)
+                        max_outstanding_writes = cur_outstanding_io;
+        } else {
+                if (cur_outstanding_io > max_outstanding_reads)
+                        max_outstanding_reads = cur_outstanding_io;
+        }
+
+        /* Still read the header! */
+        if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) {
+                /* Fake having done the hard work */
+                bio->bi_error = 0;
+                toi_end_bio(bio);
+        } else
+                submit_bio(writing | REQ_SYNC, bio);
+
+        return 0;
+}
+
+/**
+ * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
+ *
+ * @writing: Whether reading or writing.
+ * @bdev: The block device which we're using.
+ * @block0: The first sector we're reading or writing.
+ * @page: The page on which I/O is being done.
+ * @is_readahead: Whether we're doing readahead.
+ * @syncio: Whether the i/o is being done synchronously.
+ * @free_group: If writing, the group to pass to the completion routine when
+ *        freeing the page.
+ *
+ * Prepare and start a read or write operation.
+ *
+ * Note that we always work with our own page. If writing, we might be given a
+ * compression buffer that will immediately be used to start compressing the
+ * next page. For reading, we do readahead and therefore don't know the final
+ * address where the data needs to go.
+ **/
+int toi_do_io(int writing, struct block_device *bdev, long block0,
+        struct page *page, int is_readahead, int syncio, int free_group)
+{
+        page->private = 0;
+
+        /* Do here so we don't race against toi_bio_get_next_page_read */
+        lock_page(page);
+
+        if (is_readahead) {
+                if (readahead_list_head)
+                        readahead_list_tail->private = (unsigned long) page;
+                else
+                        readahead_list_head = page;
+
+                readahead_list_tail = page;
+        }
+
+        /* Done before submitting to avoid races. */
+        if (syncio)
+                waiting_on = page;
+
+        /* Submit the page */
+        get_page(page);
+
+        if (submit(writing, bdev, block0, page, free_group))
+                return -EFAULT;
+
+        if (syncio)
+                do_bio_wait(2);
+
+        return 0;
+}
+
+/**
+ * toi_bdev_page_io - simpler interface to do directly i/o on a single page
+ * @writing: Whether reading or writing.
+ * @bdev: Block device on which we're operating.
+ * @pos: Sector at which page to read or write starts.
+ * @page: Page to be read/written.
+ *
+ * A simple interface to submit a page of I/O and wait for its completion.
+ * The caller must free the page used.
+ **/
+static int toi_bdev_page_io(int writing, struct block_device *bdev,
+                long pos, struct page *page)
+{
+        return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
+}
+
+/**
+ * toi_bio_memory_needed - report the amount of memory needed for block i/o
+ *
+ * We want to have at least enough memory so as to have target_outstanding_io
+ * or more transactions on the fly at once. If we can do more, fine.
+ **/
+static int toi_bio_memory_needed(void)
+{
+        return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
+                                sizeof(struct bio));
+}
+
+/**
+ * toi_bio_print_debug_stats - put out debugging info in the buffer provided
+ * @buffer: A buffer of size @size into which text should be placed.
+ * @size: The size of @buffer.
+ *
+ * Fill a buffer with debugging info. This is used for both our debug_info sysfs
+ * entry and for recording the same info in dmesg.
+ **/
+static int toi_bio_print_debug_stats(char *buffer, int size)
+{
+        int len = 0;
+
+        if (toiActiveAllocator != &toi_blockwriter_ops) {
+                len = scnprintf(buffer, size,
+                                "- Block I/O inactive.\n");
+                return len;
+        }
+
+        len = scnprintf(buffer, size, "- Block I/O active.\n");
+
+        len += toi_bio_chains_debug_info(buffer + len, size - len);
+
+        len += scnprintf(buffer + len, size - len,
+                        "- Max outstanding reads %d. Max writes %d.\n",
+                        max_outstanding_reads, max_outstanding_writes);
+
+        len += scnprintf(buffer + len, size - len,
+                "  Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
+                target_outstanding_io,
+                PAGE_SIZE, (unsigned int) sizeof(struct request),
+                (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
+
+#ifdef MEASURE_MUTEX_CONTENTION
+        {
+        int i;
+
+        len += scnprintf(buffer + len, size - len,
+                "  Mutex contention while reading:\n  Contended      Free\n");
+
+        for_each_online_cpu(i)
+                len += scnprintf(buffer + len, size - len,
+                "  %9lu %9lu\n",
+                mutex_times[0][0][i], mutex_times[0][1][i]);
+
+        len += scnprintf(buffer + len, size - len,
+                "  Mutex contention while writing:\n  Contended      Free\n");
+
+        for_each_online_cpu(i)
+                len += scnprintf(buffer + len, size - len,
+                "  %9lu %9lu\n",
+                mutex_times[1][0][i], mutex_times[1][1][i]);
+
+        }
+#endif
+
+        return len + scnprintf(buffer + len, size - len,
+                "  Free mem throttle point reached %d.\n", free_mem_throttle);
+}
+
+static int total_header_bytes;
+static int unowned;
+
+void debug_broken_header(void)
+{
+        printk(KERN_DEBUG "Image header too big for size allocated!\n");
+        print_toi_header_storage_for_modules();
+        printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed());
+        printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header));
+        printk(KERN_DEBUG "Total unowned : %d.\n", unowned);
+        printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes,
+                        DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
+        printk(KERN_DEBUG "Space needed now : %ld.\n",
+                        get_header_storage_needed(0));
+        dump_block_chains();
+        abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
+}
+
+static int toi_bio_update_previous_inc_img_ptr(int stream)
+{
+        int result;
+        char *buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
+        struct page *page;
+        struct toi_incremental_image_pointer *prev, *this;
+
+        prev = &toi_inc_ptr[stream][0];
+        this = &toi_inc_ptr[stream][1];
+
+        if (!buffer) {
+                /*
+                 * We're at the start of writing a pageset.
+                 * Memory should not be that scarce.
+                 */
+                return -ENOMEM;
+        }
+
+        page = virt_to_page(buffer);
+        result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
+
+        if (result)
+                goto out;
+
+        memcpy(buffer, (char *) this, sizeof(this->save));
+
+        result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
+
+        /*
+         * If the I/O is successfully submitted (!result), the page will be
+         * freed asynchronously on completion.
+         */
+out:
+        if (result)
+                toi__free_page(12, virt_to_page(buffer));
+        return result;
+}
+
+/**
+ * toi_write_init_incremental - incremental image part of setting up to write new section
+ */
+static int toi_write_init_incremental(int stream)
+{
+        int result = 0;
+
+        /* Remember the location of this block so we can link to it. */
+        toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]);
+
+        /*
+         * Update the pointer at the start of the last pageset with the
+         * same stream number.
+         */
+        result = toi_bio_update_previous_inc_img_ptr(stream);
+        if (result)
+                return result;
+
+        /* Move the current to the previous slot. */
+        memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1],
+                        sizeof(toi_inc_ptr[stream][1]));
+
+        /* Store a blank pointer at the start of this incremental pageset. */
+        memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1]));
+        result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1],
+                        sizeof(toi_inc_ptr[stream][1]), 0);
+        if (result)
+                return result;
+
+        /* Serialise extent chains if this is an incremental pageset. */
+        return toi_serialise_extent_chains();
+}
+
+/**
+ * toi_read_init_incremental - incremental image part of setting up to read new section
+ */
+static int toi_read_init_incremental(int stream)
+{
+        int result;
+
+        /* Set our position to the start of the next pageset. */
+        toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]);
+
+        /* Read the start of the next incremental pageset (if any). */
+        result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1],
+                        sizeof(toi_inc_ptr[stream][1]), 0);
+
+        if (!result)
+                result = toi_load_extent_chains();
+
+        return result;
+}
+
+/**
+ * toi_rw_init - prepare to read or write a stream in the image
+ * @writing: Whether reading or writing.
+ * @stream_number: Section of the image being processed.
+ *
+ * Prepare to read or write a section ('stream') in the image.
+ **/
+static int toi_rw_init(int writing, int stream_number)
+{
+        if (stream_number)
+                toi_extent_state_restore(stream_number);
+        else
+                toi_extent_state_goto_start();
+
+        if (writing) {
+                reset_idx = 0;
+                if (!current_stream)
+                        page_idx = 0;
+        } else {
+                reset_idx = 1;
+        }
+
+        atomic_set(&toi_io_done, 0);
+        if (!toi_writer_buffer)
+                toi_writer_buffer = (char *) toi_get_zeroed_page(11,
+                                TOI_ATOMIC_GFP);
+        toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
+
+        current_stream = stream_number;
+
+        more_readahead = 1;
+
+        if (test_result_state(TOI_KEPT_IMAGE)) {
+                int result;
+
+                if (writing)
+                        result = toi_write_init_incremental(stream_number);
+                else
+                        result = toi_read_init_incremental(stream_number);
+
+                if (result)
+                        return result;
+        }
+
+        return toi_writer_buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * toi_bio_queue_write - queue a page for writing
+ * @full_buffer: Pointer to a page to be queued
+ *
+ * Add a page to the queue to be submitted. If we're the queue flusher,
+ * we'll do this once we've dropped toi_bio_mutex, so other threads can
+ * continue to submit I/O while we're on the slow path doing the actual
+ * submission.
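+ *
+ * Pages are chained into a singly-linked list via page->private;
+ * bio_queue_head and bio_queue_tail track the ends under bio_queue_lock.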
+ **/
+static void toi_bio_queue_write(char **full_buffer)
+{
+        struct page *page = virt_to_page(*full_buffer);
+        unsigned long flags;
+
+        *full_buffer = NULL;
+        page->private = 0;
+
+        spin_lock_irqsave(&bio_queue_lock, flags);
+        if (!bio_queue_head)
+                bio_queue_head = page;
+        else
+                bio_queue_tail->private = (unsigned long) page;
+
+        bio_queue_tail = page;
+        atomic_inc(&toi_bio_queue_size);
+
+        spin_unlock_irqrestore(&bio_queue_lock, flags);
+        wake_up(&toi_io_queue_flusher);
+}
+
+/**
+ * toi_rw_cleanup - Cleanup after i/o.
+ * @writing: Whether we were reading or writing.
+ *
+ * Flush all I/O and clean everything up after reading or writing a
+ * section of the image.
+ **/
+static int toi_rw_cleanup(int writing)
+{
+        int i, result = 0;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
+        if (writing) {
+                if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
+                        toi_bio_queue_write(&toi_writer_buffer);
+
+                while (bio_queue_head && !result)
+                        result = toi_bio_queue_flush_pages(0);
+
+                if (result)
+                        return result;
+
+                if (current_stream == 2)
+                        toi_extent_state_save(1);
+                else if (current_stream == 1)
+                        toi_extent_state_save(3);
+        }
+
+        result = toi_finish_all_io();
+
+        while (readahead_list_head) {
+                void *next = (void *) readahead_list_head->private;
+                toi__free_page(12, readahead_list_head);
+                readahead_list_head = next;
+        }
+
+        readahead_list_tail = NULL;
+
+        if (!current_stream)
+                return result;
+
+        for (i = 0; i < NUM_REASONS; i++) {
+                if (!atomic_read(&reasons[i]))
+                        continue;
+                printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
+                                reason_name[i], atomic_read(&reasons[i]));
+                atomic_set(&reasons[i], 0);
+        }
+
+        current_stream = 0;
+        return result;
+}
+
+/**
+ * toi_start_one_readahead - start one page of readahead
+ * @dedicated_thread: Is this a thread dedicated to doing readahead?
+ *
+ * Start one new page of readahead. If this is being called by a thread
+ * whose only job is to submit readahead, don't quit because we failed
+ * to allocate a page.
+ **/
+static int toi_start_one_readahead(int dedicated_thread)
+{
+        char *buffer = NULL;
+        int oom = 0, result;
+
+        result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
+        if (result) {
+                printk("toi_start_one_readahead: throttle_if_needed "
+                                "returned %d.\n", result);
+                return result;
+        }
+
+        mutex_lock(&toi_bio_readahead_mutex);
+
+        while (!buffer) {
+                buffer = (char *) toi_get_zeroed_page(12,
+                                TOI_ATOMIC_GFP);
+                if (!buffer) {
+                        if (oom && !dedicated_thread) {
+                                mutex_unlock(&toi_bio_readahead_mutex);
+                                printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
+                                return -ENOMEM;
+                        }
+
+                        oom = 1;
+                        set_free_mem_throttle();
+                        do_bio_wait(5);
+                }
+        }
+
+        result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
+        if (result)
+                printk("toi_start_one_readahead: toi_bio_rw_page "
+                                "returned %d.\n", result);
+        if (result == -ENOSPC)
+                toi__free_page(12, virt_to_page(buffer));
+        mutex_unlock(&toi_bio_readahead_mutex);
+        if (result) {
+                if (result == -ENOSPC)
+                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
+                                        "Last readahead page submitted.");
+                else
+                        printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
+                                        result);
+        }
+        return result;
+}
+
+/**
+ * toi_start_new_readahead - start new readahead
+ * @dedicated_thread: Are we dedicated to this task?
+ *
+ * Start readahead of image pages.
+ *
+ * We can be called as a thread dedicated to this task (may be helpful on
+ * systems with lots of CPUs), in which case we don't exit until there's no
+ * more readahead.
+ *
+ * If this is not called by a dedicated thread, we top up our queue until
+ * there's no more readahead to submit, until we've submitted the number
+ * given in target_outstanding_io, or until the number in progress reaches
+ * the target outstanding I/O value.
+ *
+ * No mutex needed because this is only ever called by the first cpu.
+ **/
+static int toi_start_new_readahead(int dedicated_thread)
+{
+        int last_result, num_submitted = 0;
+
+        /* Start a new readahead? */
+        if (!more_readahead)
+                return 0;
+
+        do {
+                last_result = toi_start_one_readahead(dedicated_thread);
+
+                if (last_result) {
+                        if (last_result == -ENOMEM || last_result == -ENOSPC)
+                                return 0;
+
+                        printk(KERN_DEBUG
+                                "Begin read chunk returned %d.\n",
+                                last_result);
+                } else
+                        num_submitted++;
+
+        } while (more_readahead && !last_result &&
+                 (dedicated_thread ||
+                  (num_submitted < target_outstanding_io &&
+                   atomic_read(&toi_io_in_progress) < target_outstanding_io)));
+
+        return last_result;
+}
+
+/**
+ * bio_io_flusher - start the dedicated I/O flushing routine
+ * @writing: Whether we're writing the image.
+ **/
+static int bio_io_flusher(int writing)
+{
+        if (writing)
+                return toi_bio_queue_flush_pages(1);
+        else
+                return toi_start_new_readahead(1);
+}
+
+/**
+ * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
+ * @no_readahead: Whether to avoid using readahead
+ *
+ * Read a page from disk, submitting readahead and cleaning up finished i/o
+ * while we wait for the page we're after.
+ **/
+static int toi_bio_get_next_page_read(int no_readahead)
+{
+        char *virt;
+        struct page *old_readahead_list_head;
+
+        /*
+         * When reading the second page of the header, we have to
+         * delay submitting the read until after we've gotten the
+         * extents out of the first page.
+         */
+        if (unlikely(no_readahead)) {
+                int result = toi_start_one_readahead(0);
+
+                if (result) {
+                        printk(KERN_EMERG "No readahead and "
+                                "toi_start_one_readahead returned non-zero.\n");
+                        return -EIO;
+                }
+        }
+
+        if (unlikely(!readahead_list_head)) {
+                /*
+                 * If the last page finishes exactly on the page
+                 * boundary, we will be called one extra time and
+                 * have no data to return. In this case, we should
+                 * not BUG(), like we used to!
+                 */
+                if (!more_readahead) {
+                        printk(KERN_EMERG "No more readahead.\n");
+                        return -ENOSPC;
+                }
+                if (unlikely(toi_start_one_readahead(0))) {
+                        printk(KERN_EMERG "No readahead and "
+                         "toi_start_one_readahead returned non-zero.\n");
+                        return -EIO;
+                }
+        }
+
+        if (PageLocked(readahead_list_head)) {
+                waiting_on = readahead_list_head;
+                do_bio_wait(0);
+        }
+
+        virt = page_address(readahead_list_head);
+        memcpy(toi_writer_buffer, virt, PAGE_SIZE);
+        
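+        /*
+         * Advance the readahead list under toi_bio_readahead_mutex so we
+         * don't race with new readahead pages being chained on.
+         */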
+        mutex_lock(&toi_bio_readahead_mutex);
+        old_readahead_list_head = readahead_list_head;
+        readahead_list_head = (struct page *) readahead_list_head->private;
+        mutex_unlock(&toi_bio_readahead_mutex);
+        toi__free_page(12, old_readahead_list_head);
+        return 0;
+}
+
+/**
+ * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
+ * @dedicated_thread: Whether we're a dedicated thread
+ *
+ * Flush the queue of pages ready to be written to disk.
+ *
+ * If we're a dedicated thread, stay in here until told to leave,
+ * sleeping in wait_event.
+ *
+ * The first thread is normally the only one to come in here. Another
+ * thread can enter this routine too, though, via throttle_if_needed.
+ * Since that's the case, we must be careful to only have one thread
+ * doing this work at a time. Otherwise we have a race and could save
+ * pages out of order.
+ *
+ * If an error occurs, free all remaining pages without submitting them
+ * for I/O.
+ **/
+
+int toi_bio_queue_flush_pages(int dedicated_thread)
+{
+        unsigned long flags;
+        int result = 0;
+        static DEFINE_MUTEX(busy);
+
+        if (!mutex_trylock(&busy))
+                return 0;
+
+top:
+        spin_lock_irqsave(&bio_queue_lock, flags);
+        while (bio_queue_head) {
+                struct page *page = bio_queue_head;
+                bio_queue_head = (struct page *) page->private;
+                if (bio_queue_tail == page)
+                        bio_queue_tail = NULL;
+                atomic_dec(&toi_bio_queue_size);
+                spin_unlock_irqrestore(&bio_queue_lock, flags);
+
+                /* Don't generate more error messages if already had one */
+                if (!result)
+                        result = toi_bio_rw_page(WRITE, page, 0, 11);
+                /*
+                 * If writing the page failed, don't drop out.
+                 * Flush the rest of the queue too.
+                 */
+                if (result)
+                        toi__free_page(11, page);
+                spin_lock_irqsave(&bio_queue_lock, flags);
+        }
+        spin_unlock_irqrestore(&bio_queue_lock, flags);
+
+        if (dedicated_thread) {
+                wait_event(toi_io_queue_flusher, bio_queue_head ||
+                                toi_bio_queue_flusher_should_finish);
+                if (likely(!toi_bio_queue_flusher_should_finish))
+                        goto top;
+                toi_bio_queue_flusher_should_finish = 0;
+        }
+
+        mutex_unlock(&busy);
+        return result;
+}
+
+/**
+ * toi_bio_get_new_page - get a new page for I/O
+ * @full_buffer: Pointer to a page to allocate.
+ **/
+static int toi_bio_get_new_page(char **full_buffer)
+{
+        int result = throttle_if_needed(THROTTLE_WAIT);
+        if (result)
+                return result;
+
+        while (!*full_buffer) {
+                *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
+                if (!*full_buffer) {
+                        set_free_mem_throttle();
+                        do_bio_wait(3);
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
+ * @writing:                Bool - whether writing (or reading).
+ * @buffer:                The start of the buffer to write or fill.
+ * @buffer_size:        The size of the buffer to write or fill.
+ * @no_readahead:        Don't try to start readahead (when getting extents).
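+ *
+ * Data is staged through toi_writer_buffer, a single PAGE_SIZE buffer.
+ * Whenever the staging page fills (writing) or empties (reading), a full
+ * page of I/O is queued or fetched and toi_writer_buffer_posn is reset.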
+ **/
+static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+                int no_readahead)
+{
+        int bytes_left = buffer_size, result = 0;
+
+        while (bytes_left) {
+                char *source_start = buffer + buffer_size - bytes_left;
+                char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
+                int capacity = PAGE_SIZE - toi_writer_buffer_posn;
+                char *to = writing ? dest_start : source_start;
+                char *from = writing ? source_start : dest_start;
+
+                if (bytes_left <= capacity) {
+                        memcpy(to, from, bytes_left);
+                        toi_writer_buffer_posn += bytes_left;
+                        return 0;
+                }
+
+                /* Complete this page and start a new one */
+                memcpy(to, from, capacity);
+                bytes_left -= capacity;
+
+                if (!writing) {
+                        /*
+                         * Perform actual I/O:
+                         * read readahead_list_head into toi_writer_buffer
+                         */
+                        int result = toi_bio_get_next_page_read(no_readahead);
+                        if (result && bytes_left) {
+                                printk("toi_bio_get_next_page_read "
+                                        "returned %d. Expecting to read "
+                                        "%d bytes.\n", result, bytes_left);
+                                return result;
+                        }
+                } else {
+                        toi_bio_queue_write(&toi_writer_buffer);
+                        result = toi_bio_get_new_page(&toi_writer_buffer);
+                        if (result) {
+                                printk(KERN_ERR "toi_bio_get_new_page returned "
+                                                "%d.\n", result);
+                                return result;
+                        }
+                }
+
+                toi_writer_buffer_posn = 0;
+                toi_cond_pause(0, NULL);
+        }
+
+        return 0;
+}
+
+/**
+ * toi_bio_read_page - read a page of the image
+ * @pfn:                The pfn where the data belongs.
+ * @buf_type:                How to map @buffer_page (passed to TOI_MAP/TOI_UNMAP).
+ * @buffer_page:        The page containing the (possibly compressed) data.
+ * @buf_size:        The number of bytes on @buffer_page used (PAGE_SIZE).
+ *
+ * Read a (possibly compressed) page from the image, into buffer_page,
+ * returning its pfn and the buffer size.
+ **/
+static int toi_bio_read_page(unsigned long *pfn, int buf_type,
+                void *buffer_page, unsigned int *buf_size)
+{
+        int result = 0;
+        int this_idx;
+        char *buffer_virt = TOI_MAP(buf_type, buffer_page);
+
+        /*
+         * Only call start_new_readahead if we don't have a dedicated thread
+         * and we're the queue flusher.
+         */
+        if (current == toi_queue_flusher && more_readahead &&
+                        !test_action_state(TOI_NO_READAHEAD)) {
+                int result2 = toi_start_new_readahead(0);
+                if (result2) {
+                        printk(KERN_DEBUG "Queue flusher and "
+                         "toi_start_one_readahead returned non-zero.\n");
+                        result = -EIO;
+                        goto out;
+                }
+        }
+
+        my_mutex_lock(0, &toi_bio_mutex);
+
+        /*
+         * Structure in the image:
+         *        [page index|destination pfn|page size|page data]
+         * buf_size is PAGE_SIZE
+         * We can validly find there's nothing to read in a multithreaded
+         * situation.
+         */
+        if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
+            toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
+            toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
+            toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
+                result = -ENODATA;
+                goto out_unlock;
+        }
+
+        if (reset_idx) {
+                page_idx = this_idx;
+                reset_idx = 0;
+        } else {
+                page_idx++;
+                if (!this_idx)
+                        result = -ENODATA;
+                else if (page_idx != this_idx)
+                        printk(KERN_ERR "Got page index %d, expected %d.\n",
+                                        this_idx, page_idx);
+        }
+
+out_unlock:
+        my_mutex_unlock(0, &toi_bio_mutex);
+out:
+        TOI_UNMAP(buf_type, buffer_page);
+        return result;
+}
+
+/**
+ * toi_bio_write_page - write a page of the image
+ * @pfn:                The pfn where the data belongs.
+ * @buf_type:                How to map @buffer_page (passed to TOI_MAP/TOI_UNMAP).
+ * @buffer_page:        The page containing the (possibly compressed) data.
+ * @buf_size:        The number of bytes on @buffer_page used.
+ *
+ * Write a (possibly compressed) page to the image from the buffer, together
+ * with its index and buffer size.
+ **/
+static int toi_bio_write_page(unsigned long pfn, int buf_type,
+                void *buffer_page, unsigned int buf_size)
+{
+        char *buffer_virt;
+        int result = 0, result2 = 0;
+
+        if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
+                return 0;
+
+        my_mutex_lock(1, &toi_bio_mutex);
+
+        if (test_result_state(TOI_ABORTED)) {
+                my_mutex_unlock(1, &toi_bio_mutex);
+                return 0;
+        }
+
+        buffer_virt = TOI_MAP(buf_type, buffer_page);
+        page_idx++;
+
+        /*
+         * Structure in the image:
+         *        [page index|destination pfn|page size|page data]
+         * buf_size is PAGE_SIZE
+         */
+        if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
+            toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
+            toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
+            toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
+                printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
+                                "toi_bio_write_page.\n");
+                result = -EIO;
+        }
+
+        TOI_UNMAP(buf_type, buffer_page);
+        my_mutex_unlock(1, &toi_bio_mutex);
+
+        if (current == toi_queue_flusher)
+                result2 = toi_bio_queue_flush_pages(0);
+
+        return result ? result : result2;
+}
+
+/**
+ * _toi_rw_header_chunk - read or write a portion of the image header
+ * @writing:                Whether reading or writing.
+ * @owner:                The module for which we're writing.
+ *                        Used for confirming that modules
+ *                        don't use more header space than they asked for.
+ * @buffer:                Address of the data to write.
+ * @buffer_size:        Size of the data buffer.
+ * @no_readahead:        Don't try to start readahead (when getting extents).
+ *
+ * Perform PAGE_SIZE I/O. Start readahead if needed.
+ **/
+static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+                char *buffer, int buffer_size, int no_readahead)
+{
+        int result = 0;
+
+        if (owner) {
+                owner->header_used += buffer_size;
+                toi_message(TOI_HEADER, TOI_LOW, 1,
+                        "Header: %s : %d bytes (%d/%d) from offset %d.",
+                        owner->name,
+                        buffer_size, owner->header_used,
+                        owner->header_requested,
+                        toi_writer_buffer_posn);
+                if (owner->header_used > owner->header_requested && writing) {
+                        printk(KERN_EMERG "TuxOnIce module %s is using more "
+                                "header space (%u) than it requested (%u).\n",
+                                owner->name,
+                                owner->header_used,
+                                owner->header_requested);
+                        return buffer_size;
+                }
+        } else {
+                unowned += buffer_size;
+                toi_message(TOI_HEADER, TOI_LOW, 1,
+                        "Header: (No owner): %d bytes (%d total so far) from "
+                        "offset %d.", buffer_size, unowned,
+                        toi_writer_buffer_posn);
+        }
+
+        if (!writing && !no_readahead && more_readahead) {
+                result = toi_start_new_readahead(0);
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead "
+                                "returned %d.", result);
+        }
+
+        if (!result) {
+                result = toi_rw_buffer(writing, buffer, buffer_size,
+                                no_readahead);
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned "
+                                "%d.", result);
+        }
+
+        total_header_bytes += buffer_size;
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning "
+                        "%d.", result);
+        return result;
+}
+
+static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+                char *buffer, int size)
+{
+        return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
+}
+
+static int toi_rw_header_chunk_noreadahead(int writing,
+                struct toi_module_ops *owner, char *buffer, int size)
+{
+        return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
+}
+
+/**
+ * toi_bio_storage_needed - get the amount of storage needed for my fns
+ **/
+static int toi_bio_storage_needed(void)
+{
+        return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed();
+}
+
+/**
+ * toi_bio_save_config_info - save block I/O config to image header
+ * @buf:        PAGE_SIZE'd buffer into which data should be saved.
+ **/
+static int toi_bio_save_config_info(char *buf)
+{
+        int *ints = (int *) buf;
+        ints[0] = target_outstanding_io;
+        return sizeof(int);
+}
+
+/**
+ * toi_bio_load_config_info - restore block I/O config
+ * @buf:        Data to be reloaded.
+ * @size:        Size of the buffer saved.
+ **/
+static void toi_bio_load_config_info(char *buf, int size)
+{
+        int *ints = (int *) buf;
+        target_outstanding_io  = ints[0];
+}
+
+void close_resume_dev_t(int force)
+{
+        if (!resume_block_device)
+                return;
+
+        if (force)
+                atomic_set(&resume_bdev_open_count, 0);
+        else
+                atomic_dec(&resume_bdev_open_count);
+
+        if (!atomic_read(&resume_bdev_open_count)) {
+                toi_close_bdev(resume_block_device);
+                resume_block_device = NULL;
+        }
+}
+
+int open_resume_dev_t(int force, int quiet)
+{
+        if (force) {
+                close_resume_dev_t(1);
+                atomic_set(&resume_bdev_open_count, 1);
+        } else
+                atomic_inc(&resume_bdev_open_count);
+
+        if (resume_block_device)
+                return 0;
+
+        resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0);
+        if (IS_ERR(resume_block_device)) {
+                if (!quiet)
+                        toi_early_boot_message(1, TOI_CONTINUE_REQ,
+                                "Failed to open device %x, where"
+                                " the header should be found.",
+                                resume_dev_t);
+                resume_block_device = NULL;
+                atomic_set(&resume_bdev_open_count, 0);
+                return 1;
+        }
+
+        return 0;
+}
+
+/**
+ * toi_bio_initialise - initialise bio code at start of some action
+ * @starting_cycle:        Whether starting a hibernation cycle, or just reading or
+ *                        writing a sysfs value.
+ **/
+static int toi_bio_initialise(int starting_cycle)
+{
+        int result;
+
+        if (!starting_cycle || !resume_dev_t)
+                return 0;
+
+        max_outstanding_writes = 0;
+        max_outstanding_reads = 0;
+        current_stream = 0;
+        toi_queue_flusher = current;
+#ifdef MEASURE_MUTEX_CONTENTION
+        {
+                int i, j, k;
+
+                for (i = 0; i < 2; i++)
+                        for (j = 0; j < 2; j++)
+                                for_each_online_cpu(k)
+                                        mutex_times[i][j][k] = 0;
+        }
+#endif
+        result = open_resume_dev_t(0, 1);
+
+        if (result)
+                return result;
+
+        result = toi_bio_register_storage();
+
+        if (result)
+            return result;
+
+        return get_signature_page();
+}
+
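+/*
+ * raw_to_real - convert a raw page count into usable data pages.
+ *
+ * Each page of image data carries a small amount of per-page metadata,
+ * accounted for here as sizeof(unsigned long) + sizeof(int) bytes per page.
+ * 'extra' is (roughly) the ceiling of raw * overhead / (PAGE_SIZE + overhead),
+ * i.e. how many of the raw pages the metadata will consume.
+ */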
+static unsigned long raw_to_real(unsigned long raw)
+{
+        unsigned long extra;
+
+        extra = (raw * (sizeof(unsigned long) + sizeof(int)) +
+                (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
+                (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
+
+        return raw > extra ? raw - extra : 0;
+}
+
+static unsigned long toi_bio_storage_available(void)
+{
+        unsigned long sum = 0;
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    this_module->type != BIO_ALLOCATOR_MODULE)
+                        continue;
+                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage "
+                                "available from %s.", this_module->name);
+                sum += this_module->bio_allocator_ops->storage_available();
+        }
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu "
+                        "pages (%d header pages).", sum, header_pages_reserved);
+
+        return sum > header_pages_reserved ?
+                raw_to_real(sum - header_pages_reserved) : 0;
+}
+
+static unsigned long toi_bio_storage_allocated(void)
+{
+        return raw_pages_allocd > header_pages_reserved ?
+                raw_to_real(raw_pages_allocd - header_pages_reserved) : 0;
+}
+
+/*
+ * If we have read part of the image, we might have filled memory with
+ * data that should be zeroed out.
+ */
+static void toi_bio_noresume_reset(void)
+{
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset.");
+        toi_rw_cleanup(READ);
+        free_all_bdev_info();
+}
+
+/**
+ * toi_bio_cleanup - cleanup after some action
+ * @finishing_cycle:        Whether completing a cycle.
+ **/
+static void toi_bio_cleanup(int finishing_cycle)
+{
+        if (!finishing_cycle)
+                return;
+
+        if (toi_writer_buffer) {
+                toi_free_page(11, (unsigned long) toi_writer_buffer);
+                toi_writer_buffer = NULL;
+        }
+
+        forget_signature_page();
+
+        if (header_block_device && toi_sig_data &&
+                        toi_sig_data->header_dev_t != resume_dev_t)
+                toi_close_bdev(header_block_device);
+
+        header_block_device = NULL;
+
+        close_resume_dev_t(0);
+}
+
+static int toi_bio_write_header_init(void)
+{
+        int result;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init");
+        toi_rw_init(WRITE, 0);
+        toi_writer_buffer_posn = 0;
+
+        /* Info needed to bootstrap goes at the start of the header.
+         * First we save the positions and devinfo, including the number
+         * of header pages. Then we save the structs containing data needed
+         * for reading the header pages back.
+         * Note that even if header pages take more than one page, when we
+         * read back the info, we will have restored the location of the
+         * next header page by the time we go to use it.
+         */
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains.");
+        result = toi_serialise_extent_chains();
+
+        if (result)
+                return result;
+
+        /*
+         * Signature page hasn't been modified at this point. Write it in
+         * the header so we can restore it later.
+         */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
+        return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
+                        (char *) toi_cur_sig_page,
+                        PAGE_SIZE);
+}
+
+static int toi_bio_write_header_cleanup(void)
+{
+        int result = 0;
+
+        if (toi_writer_buffer_posn)
+                toi_bio_queue_write(&toi_writer_buffer);
+
+        result = toi_finish_all_io();
+
+        unowned = 0;
+        total_header_bytes = 0;
+
+        /* Set the signature to say we have an image */
+        if (!result)
+                result = toi_bio_mark_have_image();
+
+        return result;
+}
+
+/*
+ * toi_bio_read_header_init()
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume=.
+ * 2. Check the contents of the swap header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the toi_swap configuration section
+ *    of the header and set up block device info so we can read
+ *    the rest of the header & image.
+ *
+ * Returns:
+ * May not return if the user chooses to reboot at a warning.
+ * -EINVAL if we cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int toi_bio_read_header_init(void)
+{
+        int result = 0;
+        char buf[32];
+
+        toi_writer_buffer_posn = 0;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
+
+        if (!toi_sig_data) {
+                printk(KERN_INFO "toi_bio_read_header_init called when we "
+                                "haven't verified there is an image!\n");
+                return -EINVAL;
+        }
+
+        /*
+         * If the header is not on resume_dev_t, open the header device
+         * first.
+         */
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
+                        toi_sig_data->header_dev_t);
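+        /*
+         * If the signature recorded a filesystem UUID for the header device,
+         * prefer looking the device up by UUID, since the raw dev_t may have
+         * changed since the image was written.
+         */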
+        if (toi_sig_data->have_uuid) {
+                struct fs_info seek;
+                dev_t device;
+
+                strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
+                seek.dev_t = toi_sig_data->header_dev_t;
+                seek.last_mount_size = 0;
+                device = blk_lookup_fs_info(&seek);
+                if (device) {
+                        printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
+                                        format_dev_t(buf, device));
+                        toi_sig_data->header_dev_t = device;
+                }
+        }
+        if (toi_sig_data->header_dev_t != resume_dev_t) {
+                header_block_device = toi_open_bdev(NULL,
+                                toi_sig_data->header_dev_t, 1);
+
+                if (IS_ERR(header_block_device))
+                        return PTR_ERR(header_block_device);
+        } else
+                header_block_device = resume_block_device;
+
+        if (!toi_writer_buffer)
+                toi_writer_buffer = (char *) toi_get_zeroed_page(11,
+                                TOI_ATOMIC_GFP);
+        more_readahead = 1;
+
+        /*
+         * Read toi_swap configuration.
+         * Headerblock size taken into account already.
+         */
+        result = toi_bio_ops.bdev_page_io(READ, header_block_device,
+                        toi_sig_data->first_header_block,
+                        virt_to_page((unsigned long) toi_writer_buffer));
+        if (result)
+                return result;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
+        result = toi_load_extent_chains();
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
+        toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+        if (!toi_orig_sig_page) {
+                printk(KERN_ERR "Failed to allocate memory for the original"
+                        " image signature.\n");
+                return -ENOMEM;
+        }
+
+        return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
+                        (char *) toi_orig_sig_page,
+                        PAGE_SIZE);
+}
+
+static int toi_bio_read_header_cleanup(void)
+{
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
+        return toi_rw_cleanup(READ);
+}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+/*
+ * UUID must be 32 chars long. It may have dashes, but nothing
+ * else.
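+ *
+ * For example (values illustrative only),
+ * "UUID=0123456789abcdef0123456789abcdef" and
+ * "UUID=01234567-89ab-cdef-0123-456789abcdef" both decode to the same
+ * 16 raw bytes.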
+ */
+char *uuid_from_commandline(char *commandline)
+{
+        int low = 0;
+        char *result = NULL, *output, *ptr;
+
+        if (strncmp(commandline, "UUID=", 5))
+                return NULL;
+
+        result = kzalloc(17, GFP_KERNEL);
+        if (!result) {
+                printk("Failed to kzalloc UUID text memory.\n");
+                return NULL;
+        }
+
+        ptr = commandline + 5;
+        output = result;
+
+        while (*ptr && (output - result) < 16) {
+                if (isxdigit(*ptr)) {
+                        int value = isdigit(*ptr) ? *ptr - '0' :
+                                TOLOWER(*ptr) - 'a' + 10;
+                        if (low) {
+                                *output += value;
+                                output++;
+                        } else {
+                                *output = value << 4;
+                        }
+                        low = !low;
+                } else if (*ptr != '-')
+                        break;
+                ptr++;
+        }
+
+        if ((output - result) < 16 || *ptr) {
+                printk(KERN_DEBUG "Found resume=UUID=, but the value looks "
+                                "invalid.\n");
+                kfree(result);
+                result = NULL;
+        }
+
+        return result;
+}
+
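+/*
+ * Run 'command' and, if resume_dev_t is still unset, wait once for device
+ * probing to complete and retry: the resume device may not have been
+ * discovered yet this early in boot.
+ */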
+#define retry_if_fails(command) \
+do { \
+        command; \
+        if (!resume_dev_t && !waited_for_device_probe) { \
+                wait_for_device_probe(); \
+                command; \
+                waited_for_device_probe = 1; \
+        } \
+} while(0)
+
+/**
+ * try_to_open_resume_device: Try to parse and open resume=
+ *
+ * Any "swap:" has been stripped away and we just have the path to deal with.
+ * We attempt to do name_to_dev_t, open and stat the file. Having opened the
+ * file, get the struct block_device * to match.
+ */
+static int try_to_open_resume_device(char *commandline, int quiet)
+{
+        struct kstat stat;
+        int error = 0;
+        char *uuid = uuid_from_commandline(commandline);
+        int waited_for_device_probe = 0;
+
+        resume_dev_t = MKDEV(0, 0);
+
+        if (!strlen(commandline))
+                retry_if_fails(toi_bio_scan_for_image(quiet));
+
+        if (uuid) {
+                struct fs_info seek;
+                strncpy((char *) &seek.uuid, uuid, 16);
+                seek.dev_t = resume_dev_t;
+                seek.last_mount_size = 0;
+                retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek));
+                kfree(uuid);
+        }
+
+        if (!resume_dev_t)
+                retry_if_fails(resume_dev_t = name_to_dev_t(commandline));
+
+        if (!resume_dev_t) {
+                struct file *file = filp_open(commandline,
+                                O_RDONLY|O_LARGEFILE, 0);
+
+                if (!IS_ERR(file) && file) {
+                        vfs_getattr(&file->f_path, &stat);
+                        filp_close(file, NULL);
+                } else
+                        error = vfs_stat(commandline, &stat);
+                if (!error)
+                        resume_dev_t = stat.rdev;
+        }
+
+        if (!resume_dev_t) {
+                if (quiet)
+                        return 1;
+
+                if (test_toi_state(TOI_TRYING_TO_RESUME))
+                        toi_early_boot_message(1, toi_translate_err_default,
+                          "Failed to translate \"%s\" into a device id.\n",
+                          commandline);
+                else
+                        printk("TuxOnIce: Can't translate \"%s\" into a device "
+                                        "id yet.\n", commandline);
+                return 1;
+        }
+
+        return open_resume_dev_t(1, quiet);
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume= parameter.
+ * Swap Writer accepts:
+ * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertible to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the swap file
+ * (specifying one for a swap partition is nonsensical but not prohibited).
+ * Data is validated by attempting to read a swap header from the
+ * location given. Failure will result in toi_swap refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
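+ *
+ * Examples (illustrative only): resume=swap:/dev/hda2 or
+ * resume=swap:/dev/hda2:0x100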
+ */
+static int toi_bio_parse_sig_location(char *commandline,
+                int only_allocator, int quiet)
+{
+        char *thischar, *devstart, *colon = NULL;
+        int signature_found, result = -EINVAL, temp_result = 0;
+
+        if (strncmp(commandline, "swap:", 5) &&
+            strncmp(commandline, "file:", 5)) {
+                /*
+                 * Failing swap:, we'll take a simple resume=/dev/hda2, or a
+                 * blank value (scan) but fall through to other allocators
+                 * if /dev/ or UUID= isn't matched.
+                 */
+                if (strncmp(commandline, "/dev/", 5) &&
+                    strncmp(commandline, "UUID=", 5) &&
+                    strlen(commandline))
+                        return 1;
+        } else
+                commandline += 5;
+
+        devstart = commandline;
+        thischar = commandline;
+        while ((*thischar != ':') && (*thischar != '@') &&
+                ((thischar - commandline) < 250) && (*thischar))
+                thischar++;
+
+        if (*thischar == ':') {
+                colon = thischar;
+                *colon = 0;
+                thischar++;
+        }
+
+        while ((thischar - commandline) < 250 && *thischar)
+                thischar++;
+
+        if (colon) {
+                unsigned long block;
+                temp_result = kstrtoul(colon + 1, 0, &block);
+                if (!temp_result)
+                        resume_firstblock = (int) block;
+        } else
+                resume_firstblock = 0;
+
+        clear_toi_state(TOI_CAN_HIBERNATE);
+        clear_toi_state(TOI_CAN_RESUME);
+
+        if (!temp_result)
+                temp_result = try_to_open_resume_device(devstart, quiet);
+
+        if (colon)
+                *colon = ':';
+
+        /* No error if we only scanned */
+        if (temp_result)
+                return strlen(commandline) ? -EINVAL : 1;
+
+        signature_found = toi_bio_image_exists(quiet);
+
+        if (signature_found != -1) {
+                result = 0;
+                /*
+                 * TODO: If only file storage, CAN_HIBERNATE should only be
+                 * set if file allocator's target is valid.
+                 */
+                set_toi_state(TOI_CAN_HIBERNATE);
+                set_toi_state(TOI_CAN_RESUME);
+        } else
+                if (!quiet)
+                        printk(KERN_ERR "TuxOnIce: Block I/O: No "
+                                "signature found at %s.\n", devstart);
+
+        return result;
+}
+
+static void toi_bio_release_storage(void)
+{
+        header_pages_reserved = 0;
+        raw_pages_allocd = 0;
+
+        free_all_bdev_info();
+}
+
+/*
+ * toi_bio_remove_image - invalidate the image and release its storage.
+ */
+static int toi_bio_remove_image(void)
+{
+        int result;
+
+        toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image.");
+
+        result = toi_bio_restore_original_signature();
+
+        /*
+         * We don't do a sanity check here: we want to restore the swap
+         * signature whatever version of kernel made the hibernate image.
+         *
+         * We need to write to swap, but swap may not be enabled, so
+         * we write to the device directly.
+         *
+         * If we don't have a current_signature_page, we didn't
+         * read an image header, so don't change anything.
+         */
+
+        toi_bio_release_storage();
+
+        return result;
+}
+
+struct toi_bio_ops toi_bio_ops = {
+        .bdev_page_io = toi_bdev_page_io,
+        .register_storage = toi_register_storage_chain,
+        .free_storage = toi_bio_release_storage,
+};
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
+                        0, 16384, 0, NULL),
+};
+
+struct toi_module_ops toi_blockwriter_ops = {
+        .type                           = WRITER_MODULE,
+        .name                           = "block i/o",
+        .directory                      = "block_io",
+        .module                         = THIS_MODULE,
+        .memory_needed                  = toi_bio_memory_needed,
+        .print_debug_info               = toi_bio_print_debug_stats,
+        .storage_needed                 = toi_bio_storage_needed,
+        .save_config_info               = toi_bio_save_config_info,
+        .load_config_info               = toi_bio_load_config_info,
+        .initialise                     = toi_bio_initialise,
+        .cleanup                        = toi_bio_cleanup,
+        .post_atomic_restore            = toi_bio_chains_post_atomic,
+
+        .rw_init                        = toi_rw_init,
+        .rw_cleanup                     = toi_rw_cleanup,
+        .read_page                      = toi_bio_read_page,
+        .write_page                     = toi_bio_write_page,
+        .rw_header_chunk                = toi_rw_header_chunk,
+        .rw_header_chunk_noreadahead    = toi_rw_header_chunk_noreadahead,
+        .io_flusher                     = bio_io_flusher,
+        .update_throughput_throttle     = update_throughput_throttle,
+        .finish_all_io                  = toi_finish_all_io,
+
+        .noresume_reset                 = toi_bio_noresume_reset,
+        .storage_available              = toi_bio_storage_available,
+        .storage_allocated              = toi_bio_storage_allocated,
+        .reserve_header_space           = toi_bio_reserve_header_space,
+        .allocate_storage               = toi_bio_allocate_storage,
+        .free_unused_storage            = toi_bio_free_unused_storage,
+        .image_exists                   = toi_bio_image_exists,
+        .mark_resume_attempted          = toi_bio_mark_resume_attempted,
+        .write_header_init              = toi_bio_write_header_init,
+        .write_header_cleanup           = toi_bio_write_header_cleanup,
+        .read_header_init               = toi_bio_read_header_init,
+        .read_header_cleanup            = toi_bio_read_header_cleanup,
+        .get_header_version             = toi_bio_get_header_version,
+        .remove_image                   = toi_bio_remove_image,
+        .parse_sig_location             = toi_bio_parse_sig_location,
+
+        .sysfs_data                     = sysfs_params,
+        .num_sysfs_entries              = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/**
+ * toi_block_io_load - load time routine for block I/O module
+ *
+ * Register block i/o ops and sysfs entries.
+ **/
+static __init int toi_block_io_load(void)
+{
+        return toi_register_module(&toi_blockwriter_ops);
+}
+
+late_initcall(toi_block_io_load);
diff --git b/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h
new file mode 100644
index 0000000..5e1964a
--- /dev/null
+++ b/kernel/power/tuxonice_bio_internal.h
@@ -0,0 +1,101 @@
+/*
+ * kernel/power/tuxonice_bio_internal.h
+ *
+ * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * tuxonice_bio.c, which contains low level io functions.
+ */
+
+/* Extent chains */
+void toi_extent_state_goto_start(void);
+void toi_extent_state_save(int slot);
+int go_next_page(int writing, int section_barrier);
+void toi_extent_state_restore(int slot);
+void free_all_bdev_info(void);
+int devices_of_same_priority(struct toi_bdev_info *this);
+int toi_register_storage_chain(struct toi_bdev_info *new);
+int toi_serialise_extent_chains(void);
+int toi_load_extent_chains(void);
+int toi_bio_rw_page(int writing, struct page *page, int is_readahead,
+                int free_group);
+int toi_bio_restore_original_signature(void);
+int toi_bio_devinfo_storage_needed(void);
+unsigned long get_headerblock(void);
+dev_t get_header_dev_t(void);
+struct block_device *get_header_bdev(void);
+int toi_bio_allocate_storage(unsigned long request);
+void toi_bio_free_unused_storage(void);
+
+/* Signature functions */
+#define HaveImage "HaveImage"
+#define NoImage "TuxOnIce"
+#define sig_size (sizeof(HaveImage))
+
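+/*
+ * On-disk signature block, read from and written back to the first
+ * resume block (see get_signature_page() and toi_bio_mark_have_image()).
+ */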
+struct sig_data {
+        char sig[sig_size];
+        int have_image;
+        int resumed_before;
+
+        char have_uuid;
+        char header_uuid[17];
+        dev_t header_dev_t;
+        unsigned long first_header_block;
+
+        /* Repeat the signature to be sure we have a header version */
+        char sig2[sig_size];
+        int header_version;
+};
+
+void forget_signature_page(void);
+int toi_check_for_signature(void);
+int toi_bio_image_exists(int quiet);
+int get_signature_page(void);
+int toi_bio_mark_resume_attempted(int);
+extern char *toi_cur_sig_page;
+extern char *toi_orig_sig_page;
+int toi_bio_mark_have_image(void);
+extern struct sig_data *toi_sig_data;
+extern dev_t resume_dev_t;
+extern struct block_device *resume_block_device;
+extern struct block_device *header_block_device;
+extern unsigned long resume_firstblock;
+
+struct block_device *open_bdev(dev_t device, int display_errs);
+extern int current_stream;
+extern int more_readahead;
+int toi_do_io(int writing, struct block_device *bdev, long block0,
+        struct page *page, int is_readahead, int syncio, int free_group);
+int get_main_pool_phys_params(void);
+
+void toi_close_bdev(struct block_device *bdev);
+struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
+                int display_errs);
+
+extern struct toi_module_ops toi_blockwriter_ops;
+void dump_block_chains(void);
+void debug_broken_header(void);
+extern unsigned long raw_pages_allocd, header_pages_reserved;
+int toi_bio_chains_debug_info(char *buffer, int size);
+void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd);
+int toi_bio_scan_for_image(int quiet);
+int toi_bio_get_header_version(void);
+
+void close_resume_dev_t(int force);
+int open_resume_dev_t(int force, int quiet);
+
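+/*
+ * Pointer to the start of an incremental pageset. The 'save' part is what
+ * gets written into the image (see toi_bio_update_previous_inc_img_ptr());
+ * bdev and block record where that pointer itself lives on disk.
+ */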
+struct toi_incremental_image_pointer_saved_data {
+    unsigned long block;
+    int chain;
+};
+
+struct toi_incremental_image_pointer {
+    struct toi_incremental_image_pointer_saved_data save;
+    struct block_device *bdev;
+    unsigned long block;
+};
+
+void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
+void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
diff --git b/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c
new file mode 100644
index 0000000..f5418f0
--- /dev/null
+++ b/kernel/power/tuxonice_bio_signature.c
@@ -0,0 +1,403 @@
+/*
+ * kernel/power/tuxonice_bio_signature.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ */
+
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_bio_internal.h"
+
+struct sig_data *toi_sig_data;
+
+/* Struct of swap header pages */
+
+struct old_sig_data {
+        dev_t device;
+        unsigned long sector;
+        int resume_attempted;
+        int orig_sig_type;
+};
+
+union diskpage {
+        union swap_header swh;        /* swh.magic is the only member used */
+        struct sig_data sig_data;
+        struct old_sig_data old_sig_data;
+};
+
+union p_diskpage {
+        union diskpage *pointer;
+        char *ptr;
+        unsigned long address;
+};
+
+char *toi_cur_sig_page;
+char *toi_orig_sig_page;
+int have_image;
+int have_old_image;
+
+int get_signature_page(void)
+{
+        if (!toi_cur_sig_page) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0,
+                                "Allocating current signature page.");
+                toi_cur_sig_page = (char *) toi_get_zeroed_page(38,
+                        TOI_ATOMIC_GFP);
+                if (!toi_cur_sig_page) {
+                        printk(KERN_ERR "Failed to allocate memory for the "
+                                "current image signature.\n");
+                        return -ENOMEM;
+                }
+
+                toi_sig_data = (struct sig_data *) toi_cur_sig_page;
+        }
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx,"
+                        " sector %d.",
+                        resume_block_device->bd_dev, resume_firstblock);
+
+        return toi_bio_ops.bdev_page_io(READ, resume_block_device,
+                resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+void forget_signature_page(void)
+{
+        if (toi_cur_sig_page) {
+                toi_sig_data = NULL;
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page"
+                                " (%p).", toi_cur_sig_page);
+                toi_free_page(38, (unsigned long) toi_cur_sig_page);
+                toi_cur_sig_page = NULL;
+        }
+
+        if (toi_orig_sig_page) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page"
+                                " (%p).", toi_orig_sig_page);
+                toi_free_page(38, (unsigned long) toi_orig_sig_page);
+                toi_orig_sig_page = NULL;
+        }
+}
+
+/*
+ * We need to ensure we use the signature page that's currently on disk,
+ * so as to not remove the image header. Post-atomic-restore, the orig sig
+ * page will be empty, so we can use that as our method of knowing that we
+ * need to load the on-disk signature and not use the non-image sig in
+ * memory. (We're going to power down after writing the change, so it's safe.)
+ */
+int toi_bio_mark_resume_attempted(int flag)
+{
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.",
+                        flag);
+        if (!toi_orig_sig_page) {
+                forget_signature_page();
+                get_signature_page();
+        }
+        toi_sig_data->resumed_before = flag;
+        return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+                resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+int toi_bio_mark_have_image(void)
+{
+        int result = 0;
+        char buf[32];
+        struct fs_info *fs_info;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists.");
+        memcpy(toi_sig_data->sig, tuxonice_signature,
+                        sizeof(tuxonice_signature));
+        toi_sig_data->have_image = 1;
+        toi_sig_data->resumed_before = 0;
+        toi_sig_data->header_dev_t = get_header_dev_t();
+        toi_sig_data->have_uuid = 0;
+
+        fs_info = fs_info_from_block_dev(get_header_bdev());
+        if (fs_info && !IS_ERR(fs_info)) {
+                memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16);
+                free_fs_info(fs_info);
+        } else
+                result = (int) PTR_ERR(fs_info);
+
+        if (!result) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.",
+                                format_dev_t(buf, get_header_dev_t()));
+                toi_sig_data->have_uuid = 1;
+        } else
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for "
+                                "dev_t %s.",
+                                format_dev_t(buf, get_header_dev_t()));
+
+        toi_sig_data->first_header_block = get_headerblock();
+        have_image = 1;
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block "
+                        "is %d.", toi_sig_data->header_dev_t,
+                        toi_sig_data->first_header_block);
+
+        memcpy(toi_sig_data->sig2, tuxonice_signature,
+                        sizeof(tuxonice_signature));
+        toi_sig_data->header_version = TOI_HEADER_VERSION;
+
+        return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+                resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+int remove_old_signature(void)
+{
+        union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page;
+        char *orig_sig;
+        char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+        int result;
+        struct block_device *header_bdev;
+        struct old_sig_data *old_sig_data =
+                &swap_header_page.pointer->old_sig_data;
+
+        header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1);
+        result = toi_bio_ops.bdev_page_io(READ, header_bdev,
+                        old_sig_data->sector, virt_to_page(header_start));
+
+        if (result)
+                goto out;
+
+        /*
+         * TODO: Get the original contents of the first bytes of the swap
+         * header page.
+         */
+        if (!old_sig_data->orig_sig_type)
+                orig_sig = "SWAP-SPACE";
+        else
+                orig_sig = "SWAPSPACE2";
+
+        memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
+        memcpy(swap_header_page.ptr, header_start, 10);
+
+        result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+                resume_firstblock, virt_to_page(swap_header_page.ptr));
+
+out:
+        toi_close_bdev(header_bdev);
+        have_old_image = 0;
+        toi_free_page(38, (unsigned long) header_start);
+        return result;
+}
+
+/*
+ * toi_bio_restore_original_signature - restore the original signature
+ *
+ * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used.
+ * It will have the original signature page contents, stored in the image
+ * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain
+ * the contents that were loaded when we started the cycle.
+ */
+int toi_bio_restore_original_signature(void)
+{
+        char *use = toi_orig_sig_page ? toi_orig_sig_page : toi_cur_sig_page;
+
+        if (have_old_image)
+                return remove_old_signature();
+
+        if (!use) {
+                printk(KERN_ERR "toi_bio_restore_original_signature: "
+                                "No signature page loaded.\n");
+                return 0;
+        }
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists.");
+        have_image = 0;
+        toi_sig_data->have_image = 0;
+        return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+                resume_firstblock, virt_to_page(use));
+}
+
+/*
+ * toi_check_for_signature - See whether we have an image.
+ *
+ * Returns 0 if there is no image, 1 if there is one, 2 for a swsusp or
+ * uswsusp image, 3 for an old TuxOnIce signature, -1 if the signature is
+ * unrecognised, or a negative error code if the signature page cannot be
+ * read.
+ */
+int toi_check_for_signature(void)
+{
+        union p_diskpage swap_header_page;
+        int type;
+        const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" };
+        const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" };
+        char *swap_header;
+
+        if (!toi_cur_sig_page) {
+                int result = get_signature_page();
+
+                if (result)
+                        return result;
+        }
+
+        /*
+         * Start by looking for the binary header.
+         */
+        if (!memcmp(tuxonice_signature, toi_cur_sig_page,
+                                sizeof(tuxonice_signature))) {
+                have_image = toi_sig_data->have_image;
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. "
+                                "Have image is %d.", have_image);
+                if (have_image)
+                        toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is "
+                                        "%x. First block is %d.",
+                                        toi_sig_data->header_dev_t,
+                                        toi_sig_data->first_header_block);
+                return toi_sig_data->have_image;
+        }
+
+        /*
+         * Failing that, try old file allocator headers.
+         */
+
+        if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) {
+                have_image = 1;
+                return 1;
+        }
+
+        have_image = 0;
+
+        if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage)))
+                return 0;
+
+        /*
+         * Nope? How about swap?
+         */
+        swap_header_page = (union p_diskpage) toi_cur_sig_page;
+        swap_header = swap_header_page.pointer->swh.magic.magic;
+
+        /* Normal swapspace? */
+        for (type = 0; type < 2; type++)
+                if (!memcmp(normal_sigs[type], swap_header,
+                                        strlen(normal_sigs[type])))
+                        return 0;
+
+        /* Swsusp or uswsusp? */
+        for (type = 0; type < 3; type++)
+                if (!memcmp(swsusp_sigs[type], swap_header,
+                                        strlen(swsusp_sigs[type])))
+                        return 2;
+
+        /* Old TuxOnIce version? */
+        if (!memcmp(tuxonice_signature, swap_header,
+                                sizeof(tuxonice_signature) - 1)) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce "
+                                "signature.");
+                have_old_image = 1;
+                return 3;
+        }
+
+        return -1;
+}
+
+/*
+ * toi_bio_image_exists
+ *
+ * Returns -1 if we don't know (no signature found, or resume_dev_t unset),
+ * 0 if there is no image, 1 if a TuxOnIce image was found, 2 for a
+ * swsusp/uswsusp image, 3 for an old implementation's signature, or
+ * -ENOMEM if the signature page could not be allocated.
+ */
+int toi_bio_image_exists(int quiet)
+{
+        int result;
+        char *msg = NULL;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists.");
+
+        if (!resume_dev_t) {
+                if (!quiet)
+                        printk(KERN_INFO "Not even trying to read header "
+                                "because resume_dev_t is not set.\n");
+                return -1;
+        }
+
+        if (open_resume_dev_t(0, quiet))
+                return -1;
+
+        result = toi_check_for_signature();
+
+        clear_toi_state(TOI_RESUMED_BEFORE);
+        if (toi_sig_data->resumed_before)
+                set_toi_state(TOI_RESUMED_BEFORE);
+
+        if (quiet || result == -ENOMEM)
+                return result;
+
+        if (result == -1)
+                msg = "TuxOnIce: Unable to find a signature."
+                                " Could you have moved a swap file?\n";
+        else if (!result)
+                msg = "TuxOnIce: No image found.\n";
+        else if (result == 1)
+                msg = "TuxOnIce: Image found.\n";
+        else if (result == 2)
+                msg = "TuxOnIce: uswsusp or swsusp image found.\n";
+        else if (result == 3)
+                msg = "TuxOnIce: Old implementation's signature found.\n";
+
+        printk(KERN_INFO "%s", msg);
+
+        return result;
+}
+
+int toi_bio_scan_for_image(int quiet)
+{
+        struct block_device *bdev;
+        char default_name[255] = "";
+
+        if (!quiet)
+                printk(KERN_DEBUG "Scanning swap devices for TuxOnIce "
+                                "signature...\n");
+        for (bdev = next_bdev_of_type(NULL, "swap"); bdev;
+                                bdev = next_bdev_of_type(bdev, "swap")) {
+                int result;
+                char name[255] = "";
+                sprintf(name, "%u:%u", MAJOR(bdev->bd_dev),
+                                MINOR(bdev->bd_dev));
+                if (!quiet)
+                        printk(KERN_DEBUG "- Trying %s.\n", name);
+                resume_block_device = bdev;
+                resume_dev_t = bdev->bd_dev;
+
+                result = toi_check_for_signature();
+
+                resume_block_device = NULL;
+                resume_dev_t = MKDEV(0, 0);
+
+                if (!default_name[0])
+                        strcpy(default_name, name);
+
+                if (result == 1) {
+                        /* Got one! */
+                        strcpy(resume_file, name);
+                        next_bdev_of_type(bdev, NULL);
+                        if (!quiet)
+                                printk(KERN_DEBUG " ==> Image found on %s.\n",
+                                                resume_file);
+                        return 1;
+                }
+                forget_signature_page();
+        }
+
+        if (!quiet)
+                printk(KERN_DEBUG "TuxOnIce scan: No image found.\n");
+        strcpy(resume_file, default_name);
+        return 0;
+}
+
+int toi_bio_get_header_version(void)
+{
+        return (memcmp(toi_sig_data->sig2, tuxonice_signature,
+                                sizeof(tuxonice_signature))) ?
+                0 : toi_sig_data->header_version;
+}
diff --git b/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
new file mode 100644
index 0000000..22bf07a
--- /dev/null
+++ b/kernel/power/tuxonice_builtin.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/kernel.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/bio.h>
+#include <linux/root_dev.h>
+#include <linux/freezer.h>
+#include <linux/reboot.h>
+#include <linux/writeback.h>
+#include <linux/tty.h>
+#include <linux/crypto.h>
+#include <linux/cpu.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include "tuxonice_io.h"
+#include "tuxonice.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_alloc.h"
+
+unsigned long toi_bootflags_mask;
+
+/*
+ * Highmem related functions (x86 only).
+ */
+
+#ifdef CONFIG_HIGHMEM
+
+/**
+ * copyback_high: Restore highmem pages.
+ *
+ * Highmem data and pbe lists are/can be stored in highmem.
+ * The format is slightly different to the lowmem pbe lists
+ * used for the assembly code: the last pbe in each page is
+ * a struct page * instead of struct pbe *, pointing to the
+ * next page where pbes are stored (or NULL if it happens to be
+ * the end of the list). Since we don't want to generate
+ * unnecessary deltas against swsusp code, we use a cast
+ * instead of a union.
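+ *
+ * Concretely, as implemented below: once PBES_PER_PAGE entries in the
+ * current page have been processed, the ->next field of the last one is
+ * reinterpreted as the struct page * of the next page of pbes, and a NULL
+ * ->next ends the walk.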
+ **/
+
+static void copyback_high(void)
+{
+        struct page *pbe_page = (struct page *) restore_highmem_pblist;
+        struct pbe *this_pbe, *first_pbe;
+        unsigned long *origpage, *copypage;
+        int pbe_index = 1;
+
+        if (!pbe_page)
+                return;
+
+        this_pbe = (struct pbe *) kmap_atomic(pbe_page);
+        first_pbe = this_pbe;
+
+        while (this_pbe) {
+                int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
+
+                origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address));
+                copypage = kmap_atomic((struct page *) this_pbe->address);
+
+                while (loop >= 0) {
+                        *(origpage + loop) = *(copypage + loop);
+                        loop--;
+                }
+
+                kunmap_atomic(origpage);
+                kunmap_atomic(copypage);
+
+                if (!this_pbe->next)
+                        break;
+
+                if (pbe_index < PBES_PER_PAGE) {
+                        this_pbe++;
+                        pbe_index++;
+                } else {
+                        pbe_page = (struct page *) this_pbe->next;
+                        kunmap_atomic(first_pbe);
+                        if (!pbe_page)
+                                return;
+                        this_pbe = (struct pbe *) kmap_atomic(pbe_page);
+                        first_pbe = this_pbe;
+                        pbe_index = 1;
+                }
+        }
+        kunmap_atomic(first_pbe);
+}
+
+#else /* CONFIG_HIGHMEM */
+static void copyback_high(void) { }
+#endif
+
+char toi_wait_for_keypress_dev_console(int timeout)
+{
+        int fd, this_timeout = 255, orig_kthread = 0;
+        char key = '\0';
+        struct termios t, t_backup;
+
+        /* We should be guaranteed /dev/console exists after populate_rootfs()
+         * in init/main.c.
+         */
+        fd = sys_open("/dev/console", O_RDONLY, 0);
+        if (fd < 0) {
+                printk(KERN_INFO "Couldn't open /dev/console.\n");
+                return key;
+        }
+
+        if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
+                goto out_close;
+
+        memcpy(&t_backup, &t, sizeof(t));
+
+        t.c_lflag &= ~(ISIG|ICANON|ECHO);
+        t.c_cc[VMIN] = 0;
+
+new_timeout:
+        if (timeout > 0) {
+                this_timeout = timeout < 26 ? timeout : 25;
+                timeout -= this_timeout;
+                this_timeout *= 10;
+        }
+
+        t.c_cc[VTIME] = this_timeout;
+
+        if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
+                goto out_restore;
+
+        if (current->flags & PF_KTHREAD) {
+                orig_kthread = (current->flags & PF_KTHREAD);
+                current->flags &= ~PF_KTHREAD;
+        }
+
+        while (1) {
+                if (sys_read(fd, &key, 1) <= 0) {
+                        if (timeout)
+                                goto new_timeout;
+                        key = '\0';
+                        break;
+                }
+                key = tolower(key);
+                if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
+                        if (key == 'c') {
+                                set_toi_state(TOI_CONTINUE_REQ);
+                                break;
+                        } else if (key == ' ')
+                                break;
+                } else
+                        break;
+        }
+        if (orig_kthread)
+                current->flags |= PF_KTHREAD;
+
+out_restore:
+        sys_ioctl(fd, TCSETS, (long)&t_backup);
+out_close:
+        sys_close(fd);
+
+        return key;
+}
+
+struct toi_boot_kernel_data toi_bkd __nosavedata
+                __attribute__((aligned(PAGE_SIZE))) = {
+        MY_BOOT_KERNEL_DATA_VERSION,
+        0,
+#ifdef CONFIG_TOI_REPLACE_SWSUSP
+        (1 << TOI_REPLACE_SWSUSP) |
+#endif
+        (1 << TOI_NO_FLUSHER_THREAD) |
+        (1 << TOI_PAGESET2_FULL),
+};
+
+struct block_device *toi_open_by_devnum(dev_t dev)
+{
+        struct block_device *bdev = bdget(dev);
+        int err = -ENOMEM;
+        if (bdev)
+                err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
+        return err ? ERR_PTR(err) : bdev;
+}
+
+/**
+ * toi_close_bdev: Close a swap bdev.
+ *
+ * @bdev: The block device to close.
+ */
+void toi_close_bdev(struct block_device *bdev)
+{
+        blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+}
+
+int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
+struct toi_core_fns *toi_core_fns;
+unsigned long toi_result;
+struct pagedir pagedir1 = {1};
+struct toi_cbw **toi_first_cbw;
+int toi_next_cbw;
+
+unsigned long toi_get_nonconflicting_page(void)
+{
+        return toi_core_fns->get_nonconflicting_page();
+}
+
+int toi_post_context_save(void)
+{
+        return toi_core_fns->post_context_save();
+}
+
+int try_tuxonice_hibernate(void)
+{
+        if (!toi_core_fns)
+                return -ENODEV;
+
+        return toi_core_fns->try_hibernate();
+}
+
+static int num_resume_calls;
+#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL
+static int ignore_late_initcall = 1;
+#else
+static int ignore_late_initcall;
+#endif
+
+int toi_translate_err_default = TOI_CONTINUE_REQ;
+
+void try_tuxonice_resume(void)
+{
+        if (!hibernation_available())
+                return;
+
+        /* Don't let the call count wrap around */
+        if (num_resume_calls < 2)
+                num_resume_calls++;
+
+        if (num_resume_calls == 1 && ignore_late_initcall) {
+                printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n");
+                return;
+        }
+
+        if (toi_core_fns)
+                toi_core_fns->try_resume();
+        else
+                printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
+}
+
+int toi_lowlevel_builtin(void)
+{
+        int error = 0;
+
+        save_processor_state();
+        error = swsusp_arch_suspend();
+        if (error)
+                printk(KERN_ERR "Error %d hibernating\n", error);
+
+        /* Restore control flow appears here */
+        if (!toi_in_hibernate) {
+                copyback_high();
+                set_toi_state(TOI_NOW_RESUMING);
+        }
+
+        restore_processor_state();
+        return error;
+}
+
+unsigned long toi_compress_bytes_in;
+unsigned long toi_compress_bytes_out;
+
+int toi_in_suspend(void)
+{
+        return in_suspend;
+}
+
+unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
+                (1 << TOI_IGNORE_LOGLEVEL) |
+                (1 << TOI_IO_STOPPED));
+
+/* The number of hibernates we have started (some may have been cancelled) */
+unsigned int nr_hibernates;
+int toi_running;
+__nosavedata int toi_in_hibernate;
+__nosavedata struct pbe *restore_highmem_pblist;
+
+int toi_trace_allocs;
+
+void toi_read_lock_tasklist(void)
+{
+        read_lock(&tasklist_lock);
+}
+
+void toi_read_unlock_tasklist(void)
+{
+        read_unlock(&tasklist_lock);
+}
+
+#ifdef CONFIG_TOI_ZRAM_SUPPORT
+int (*toi_flag_zram_disks) (void);
+
+int toi_do_flag_zram_disks(void)
+{
+        return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0;
+}
+
+#endif
+
+/* toi_generate_free_page_map
+ *
+ * Description:        This routine generates a bitmap of free pages from the
+ *                 lists used by the memory manager. We then use the bitmap
+ *                 to quickly calculate which pages to save and in which
+ *                 pagesets.
+ */
+void toi_generate_free_page_map(void)
+{
+        int order, cpu, t;
+        unsigned long flags, i;
+        struct zone *zone;
+        struct list_head *curr;
+        unsigned long pfn;
+        struct page *page;
+
+        for_each_populated_zone(zone) {
+
+                if (!zone->spanned_pages)
+                        continue;
+
+                spin_lock_irqsave(&zone->lock, flags);
+
+                for (i = 0; i < zone->spanned_pages; i++) {
+                        pfn = zone->zone_start_pfn + i;
+
+                        if (!pfn_valid(pfn))
+                                continue;
+
+                        page = pfn_to_page(pfn);
+
+                        ClearPageNosaveFree(page);
+                }
+
+                for_each_migratetype_order(order, t) {
+                        list_for_each(curr,
+                                        &zone->free_area[order].free_list[t]) {
+                                unsigned long j;
+
+                                pfn = page_to_pfn(list_entry(curr, struct page,
+                                                        lru));
+                                for (j = 0; j < (1UL << order); j++)
+                                        SetPageNosaveFree(pfn_to_page(pfn + j));
+                        }
+                }
+
+                for_each_online_cpu(cpu) {
+                        struct per_cpu_pageset *pset =
+                                per_cpu_ptr(zone->pageset, cpu);
+                        struct per_cpu_pages *pcp = &pset->pcp;
+                        struct page *page;
+                        int t;
+
+                        for (t = 0; t < MIGRATE_PCPTYPES; t++)
+                                list_for_each_entry(page, &pcp->lists[t], lru)
+                                        SetPageNosaveFree(page);
+                }
+
+                spin_unlock_irqrestore(&zone->lock, flags);
+        }
+}
+
+/* toi_size_of_free_region
+ *
+ * Description:        Return the number of pages that are free, beginning with and
+ *                 including this one.
+ */
+int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn)
+{
+        unsigned long this_pfn = start_pfn,
+                      end_pfn = zone_end_pfn(zone);
+
+        while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
+                this_pfn++;
+
+        return this_pfn - start_pfn;
+}
+
+static int __init toi_wait_setup(char *str)
+{
+        int value;
+
+        if (sscanf(str, "=%d", &value)) {
+                if (value < -1 || value > 255)
+                        printk(KERN_INFO "TuxOnIce_wait outside range -1 to "
+                                        "255.\n");
+                else
+                        toi_wait = value;
+        }
+
+        return 1;
+}
+__setup("toi_wait", toi_wait_setup);
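+/*
+ * Example (kernel command line): booting with "toi_wait=10" sets toi_wait
+ * to 10. Values outside the range -1..255 are rejected with the warning
+ * above, leaving the CONFIG_TOI_DEFAULT_WAIT default in place.
+ */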
+
+static int __init toi_translate_retry_setup(char *str)
+{
+        toi_translate_err_default = 0;
+        return 1;
+}
+__setup("toi_translate_retry", toi_translate_retry_setup);
+
+static int __init toi_debug_setup(char *str)
+{
+        toi_bkd.toi_action |= (1 << TOI_LOGALL);
+        toi_bootflags_mask |= (1 << TOI_LOGALL);
+        toi_bkd.toi_debug_state = 255;
+        toi_bkd.toi_default_console_level = 7;
+        return 1;
+}
+__setup("toi_debug_setup", toi_debug_setup);
+
+static int __init toi_pause_setup(char *str)
+{
+        toi_bkd.toi_action |= (1 << TOI_PAUSE);
+        toi_bootflags_mask |= (1 << TOI_PAUSE);
+        return 1;
+}
+__setup("toi_pause", toi_pause_setup);
+
+#ifdef CONFIG_PM_DEBUG
+static int __init toi_trace_allocs_setup(char *str)
+{
+        int value;
+
+        if (sscanf(str, "=%d", &value))
+                toi_trace_allocs = value;
+
+        return 1;
+}
+__setup("toi_trace_allocs", toi_trace_allocs_setup);
+#endif
+
+static int __init toi_ignore_late_initcall_setup(char *str)
+{
+        int value;
+
+        if (sscanf(str, "=%d", &value))
+                ignore_late_initcall = value;
+
+        return 1;
+}
+__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup);
+
+static int __init toi_force_no_multithreaded_setup(char *str)
+{
+        int value;
+
+        toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO);
+        toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO);
+
+        if (sscanf(str, "=%d", &value) && value)
+                toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO);
+
+        return 1;
+}
+__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup);
+
+#ifdef CONFIG_KGDB
+static int __init toi_post_resume_breakpoint_setup(char *str)
+{
+        int value;
+
+        toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT);
+        toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT);
+        if (sscanf(str, "=%d", &value) && value)
+                toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT);
+
+        return 1;
+}
+__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup);
+#endif
+
+static int __init toi_disable_readahead_setup(char *str)
+{
+        int value;
+
+        toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD);
+        toi_bootflags_mask |= (1 << TOI_NO_READAHEAD);
+        if (sscanf(str, "=%d", &value) && value)
+                toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD);
+
+        return 1;
+}
+__setup("toi_no_readahead", toi_disable_readahead_setup);
diff --git b/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
new file mode 100644
index 0000000..9539818
--- /dev/null
+++ b/kernel/power/tuxonice_builtin.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+#include <asm/setup.h>
+
+extern struct toi_core_fns *toi_core_fns;
+extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
+extern unsigned int nr_hibernates;
+extern int toi_in_hibernate;
+
+extern __nosavedata struct pbe *restore_highmem_pblist;
+
+int toi_lowlevel_builtin(void);
+
+#ifdef CONFIG_HIGHMEM
+extern __nosavedata struct zone_data *toi_nosave_zone_list;
+extern __nosavedata unsigned long toi_nosave_max_pfn;
+#endif
+
+extern unsigned long toi_get_nonconflicting_page(void);
+extern int toi_post_context_save(void);
+
+extern char toi_wait_for_keypress_dev_console(int timeout);
+extern struct block_device *toi_open_by_devnum(dev_t dev);
+extern void toi_close_bdev(struct block_device *bdev);
+extern int toi_wait;
+extern int toi_translate_err_default;
+extern int toi_force_no_multithreaded;
+extern void toi_read_lock_tasklist(void);
+extern void toi_read_unlock_tasklist(void);
+extern int toi_in_suspend(void);
+extern void toi_generate_free_page_map(void);
+extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn);
+
+#ifdef CONFIG_TOI_ZRAM_SUPPORT
+extern int toi_do_flag_zram_disks(void);
+#else
+#define toi_do_flag_zram_disks() (0)
+#endif
diff --git b/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
new file mode 100644
index 0000000..1c4e10c
--- /dev/null
+++ b/kernel/power/tuxonice_checksum.c
@@ -0,0 +1,392 @@
+/*
+ * kernel/power/tuxonice_checksum.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data checksum routines for TuxOnIce,
+ * using cryptoapi. They are used to locate any modifications
+ * made to pageset 2 while we're saving it.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+
+static struct toi_module_ops toi_checksum_ops;
+
+/* Constant for now, but I might allow tuning later */
+static char toi_checksum_name[32] = "md4";
+/* Bytes per checksum */
+#define CHECKSUM_SIZE (16)
+
+#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
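+/*
+ * For example, assuming 4 KiB pages and 64-bit pointers (an illustration,
+ * not a requirement): (4096 - 8) / 16 = 255 checksums per checksum page.
+ */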
+
+struct cpu_context {
+        struct crypto_hash *transform;
+        struct hash_desc desc;
+        struct scatterlist sg[2];
+        char *buf;
+};
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+static int pages_allocated;
+static unsigned long page_list;
+
+static int toi_num_resaved;
+
+static unsigned long this_checksum, next_page;
+static int checksum_count;
+
+static inline int checksum_pages_needed(void)
+{
+        return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
+}
+
+/* ---- Local buffer management ---- */
+
+/*
+ * toi_checksum_cleanup
+ *
+ * Frees memory allocated for our labours.
+ */
+static void toi_checksum_cleanup(int ending_cycle)
+{
+        int cpu;
+
+        if (ending_cycle) {
+                for_each_online_cpu(cpu) {
+                        struct cpu_context *this = &per_cpu(contexts, cpu);
+                        if (this->transform) {
+                                crypto_free_hash(this->transform);
+                                this->transform = NULL;
+                                this->desc.tfm = NULL;
+                        }
+
+                        if (this->buf) {
+                                toi_free_page(27, (unsigned long) this->buf);
+                                this->buf = NULL;
+                        }
+                }
+        }
+}
+
+/*
+ * toi_checksum_initialise
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ * Returns: Int: Zero on success (or when checksumming is disabled); one if
+ * setting up the transform or buffers failed. Even if we can't set up
+ * checksumming, we still attempt to hibernate.
+ */
+static int toi_checksum_initialise(int starting_cycle)
+{
+        int cpu;
+
+        if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
+                return 0;
+
+        if (!*toi_checksum_name) {
+                printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
+                return 1;
+        }
+
+        for_each_online_cpu(cpu) {
+                struct cpu_context *this = &per_cpu(contexts, cpu);
+                struct page *page;
+
+                this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
+                if (IS_ERR(this->transform)) {
+                        printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+                                "%s checksum algorithm: %ld.\n",
+                                toi_checksum_name, (long) this->transform);
+                        this->transform = NULL;
+                        return 1;
+                }
+
+                this->desc.tfm = this->transform;
+                this->desc.flags = 0;
+
+                page = toi_alloc_page(27, GFP_KERNEL);
+                if (!page)
+                        return 1;
+                this->buf = page_address(page);
+                sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
+        }
+        return 0;
+}
+
+/*
+ * toi_checksum_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_checksum_print_debug_stats(char *buffer, int size)
+{
+        int len;
+
+        if (!toi_checksum_ops.enabled)
+                return scnprintf(buffer, size,
+                        "- Checksumming disabled.\n");
+
+        len = scnprintf(buffer, size, "- Checksum method is '%s'.\n",
+                        toi_checksum_name);
+        len += scnprintf(buffer + len, size - len,
+                "  %d pages resaved in atomic copy.\n", toi_num_resaved);
+        return len;
+}
+
+static int toi_checksum_memory_needed(void)
+{
+        return toi_checksum_ops.enabled ?
+                checksum_pages_needed() << PAGE_SHIFT : 0;
+}
+
+static int toi_checksum_storage_needed(void)
+{
+        if (toi_checksum_ops.enabled)
+                return strlen(toi_checksum_name) + sizeof(int) + 1;
+        else
+                return 0;
+}
+
+/*
+ * toi_checksum_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save information needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
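+ *
+ * Layout, as written below: sizeof(unsigned int) bytes holding the name
+ * length (including its trailing NUL), followed by the NUL-terminated
+ * algorithm name itself.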
+ */
+static int toi_checksum_save_config_info(char *buffer)
+{
+        int namelen = strlen(toi_checksum_name) + 1;
+        int total_len;
+
+        *((unsigned int *) buffer) = namelen;
+        strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
+        total_len = sizeof(unsigned int) + namelen;
+        return total_len;
+}
+
+/* toi_checksum_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description: Reload the information needed to verify the image's
+ * checksums at resume time.
+ */
+static void toi_checksum_load_config_info(char *buffer, int size)
+{
+        int namelen;
+
+        namelen = *((unsigned int *) (buffer));
+        strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
+                        namelen);
+        return;
+}
+
+/*
+ * Free Checksum Memory
+ */
+
+void free_checksum_pages(void)
+{
+        while (pages_allocated) {
+                unsigned long next = *((unsigned long *) page_list);
+                ClearPageNosave(virt_to_page(page_list));
+                toi_free_page(15, (unsigned long) page_list);
+                page_list = next;
+                pages_allocated--;
+        }
+}
+
+/*
+ * Allocate Checksum Memory
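+ *
+ * As the code below implies, the checksum store is a simple linked list of
+ * pages: the first sizeof(void *) bytes of each page hold the address of
+ * the next page (zero terminates the list) and the rest of the page holds
+ * up to CHECKSUMS_PER_PAGE checksums.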
+ */
+
+int allocate_checksum_pages(void)
+{
+        int pages_needed = checksum_pages_needed();
+
+        if (!toi_checksum_ops.enabled)
+                return 0;
+
+        while (pages_allocated < pages_needed) {
+                unsigned long *new_page =
+                  (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
+                if (!new_page) {
+                        printk(KERN_ERR "Unable to allocate checksum pages.\n");
+                        return -ENOMEM;
+                }
+                SetPageNosave(virt_to_page(new_page));
+                (*new_page) = page_list;
+                page_list = (unsigned long) new_page;
+                pages_allocated++;
+        }
+
+        next_page = (unsigned long) page_list;
+        checksum_count = 0;
+
+        return 0;
+}
+
+char *tuxonice_get_next_checksum(void)
+{
+        if (!toi_checksum_ops.enabled)
+                return NULL;
+
+        if (checksum_count % CHECKSUMS_PER_PAGE)
+                this_checksum += CHECKSUM_SIZE;
+        else {
+                this_checksum = next_page + sizeof(void *);
+                next_page = *((unsigned long *) next_page);
+        }
+
+        checksum_count++;
+        return (char *) this_checksum;
+}
+
+int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
+{
+        char *pa;
+        int result, cpu = smp_processor_id();
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+        if (!toi_checksum_ops.enabled)
+                return 0;
+
+        pa = kmap(page);
+        memcpy(ctx->buf, pa, PAGE_SIZE);
+        kunmap(page);
+        result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
+                                                checksum_locn);
+        if (result)
+                printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest "
+                                "returned %d.\n", result);
+        return result;
+}
+
+/*
+ * Verify checksums
+ */
+
+void check_checksums(void)
+{
+        int index = 0, cpu = smp_processor_id();
+        char current_checksum[CHECKSUM_SIZE];
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+        unsigned long pfn;
+
+        if (!toi_checksum_ops.enabled) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled.");
+                return;
+        }
+
+        next_page = (unsigned long) page_list;
+
+        toi_num_resaved = 0;
+        this_checksum = 0;
+
+        toi_trace_index++;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums.");
+        memory_bm_position_reset(pageset2_map);
+        for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP;
+                        pfn = memory_bm_next_pfn(pageset2_map, 0)) {
+                int ret, resave_needed = false;
+                char *pa;
+                struct page *page = pfn_to_page(pfn);
+
+                if (index < checksum_count) {
+                        if (index % CHECKSUMS_PER_PAGE) {
+                                this_checksum += CHECKSUM_SIZE;
+                        } else {
+                                this_checksum = next_page + sizeof(void *);
+                                next_page = *((unsigned long *) next_page);
+                        }
+
+                        /* Done when IRQs disabled so must be atomic */
+                        pa = kmap_atomic(page);
+                        memcpy(ctx->buf, pa, PAGE_SIZE);
+                        kunmap_atomic(pa);
+                        ret = crypto_hash_digest(&ctx->desc, ctx->sg,
+                                        PAGE_SIZE, current_checksum);
+
+                        if (ret) {
+                                printk(KERN_INFO "Digest failed. Returned "
+                                                "%d.\n", ret);
+                                return;
+                        }
+
+                        resave_needed = memcmp(current_checksum,
+                                        (char *) this_checksum, CHECKSUM_SIZE);
+                } else {
+                        resave_needed = true;
+                }
+
+                if (resave_needed) {
+                        TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed);
+                        SetPageResave(pfn_to_page(pfn));
+                        toi_num_resaved++;
+                        if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
+                                set_abort_result(TOI_RESAVE_NEEDED);
+                }
+
+                index++;
+        }
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete.");
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0,
+                        NULL),
+        SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_ABORT_ON_RESAVE_NEEDED, 0)
+};
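+
+/*
+ * Assuming the usual TuxOnIce sysfs root used elsewhere in this patch
+ * (/sys/power/tuxonice), these parameters appear as
+ * /sys/power/tuxonice/checksum/enabled and
+ * /sys/power/tuxonice/checksum/abort_if_resave_needed, e.g.
+ * "echo 1 > /sys/power/tuxonice/checksum/enabled" to turn checksumming on.
+ */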
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_checksum_ops = {
+        .type                   = MISC_MODULE,
+        .name                   = "checksumming",
+        .directory              = "checksum",
+        .module                 = THIS_MODULE,
+        .initialise             = toi_checksum_initialise,
+        .cleanup                = toi_checksum_cleanup,
+        .print_debug_info       = toi_checksum_print_debug_stats,
+        .save_config_info       = toi_checksum_save_config_info,
+        .load_config_info       = toi_checksum_load_config_info,
+        .memory_needed          = toi_checksum_memory_needed,
+        .storage_needed         = toi_checksum_storage_needed,
+
+        .sysfs_data             = sysfs_params,
+        .num_sysfs_entries      = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+int toi_checksum_init(void)
+{
+        int result = toi_register_module(&toi_checksum_ops);
+        return result;
+}
+
+void toi_checksum_exit(void)
+{
+        toi_unregister_module(&toi_checksum_ops);
+}
diff --git b/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
new file mode 100644
index 0000000..c8196fb
--- /dev/null
+++ b/kernel/power/tuxonice_checksum.h
@@ -0,0 +1,31 @@
+/*
+ * kernel/power/tuxonice_checksum.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data checksum routines for TuxOnIce,
+ * using cryptoapi. They are used to locate any modifications
+ * made to pageset 2 while we're saving it.
+ */
+
+#if defined(CONFIG_TOI_CHECKSUM)
+extern int toi_checksum_init(void);
+extern void toi_checksum_exit(void);
+void check_checksums(void);
+int allocate_checksum_pages(void);
+void free_checksum_pages(void);
+char *tuxonice_get_next_checksum(void);
+int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
+#else
+static inline int toi_checksum_init(void) { return 0; }
+static inline void toi_checksum_exit(void) { }
+static inline void check_checksums(void) { }
+static inline int allocate_checksum_pages(void) { return 0; }
+static inline void free_checksum_pages(void) { }
+static inline char *tuxonice_get_next_checksum(void) { return NULL; }
+static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
+        { return 0; }
+#endif
+
diff --git b/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
new file mode 100644
index 0000000..2873f93
--- /dev/null
+++ b/kernel/power/tuxonice_cluster.c
@@ -0,0 +1,1058 @@
+/*
+ * kernel/power/tuxonice_cluster.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains routines for cluster hibernation support.
+ *
+ * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
+ *
+ * How does it work?
+ *
+ * There is no 'master' node that tells everyone else what to do. All nodes
+ * send messages to the broadcast address/port, maintain a list of peers
+ * and figure out when to progress to the next step in hibernating or resuming.
+ * This makes us more fault tolerant when it comes to nodes coming and going
+ * (which may be more of an issue if we're hibernating when power supplies
+ * are being unreliable).
+ *
+ * At boot time, we start a ktuxonice thread that handles communication with
+ * other nodes. This node maintains a state machine that controls our progress
+ * through hibernating and resuming, keeping us in step with other nodes. Nodes
+ * are identified by their hw address.
+ *
+ * On startup, the node sends CLUSTER_PING on the configured interface's
+ * broadcast address, port $toi_cluster_port (see below) and begins to listen
+ * for other broadcast messages. CLUSTER_PING messages are repeated at
+ * intervals of 5 minutes, with a random offset to spread traffic out.
+ *
+ * A hibernation cycle is initiated from any node via
+ *
+ * echo > /sys/power/tuxonice/do_hibernate
+ *
+ * and (possibly) the hibernate script. At each step of the process, the node
+ * completes its work, and waits for all other nodes to signal completion of
+ * their work (or timeout) before progressing to the next step.
+ *
+ * Request/state   Action before reply   Possible reply    Next state
+ * HIBERNATE        capable, pre-script   HIBERNATE|ACK     NODE_PREP
+ *                                        HIBERNATE|NACK    INIT_0
+ *
+ * PREP             prepare_image         PREP|ACK          IMAGE_WRITE
+ *                                        PREP|NACK         INIT_0
+ *                                        ABORT             RUNNING
+ *
+ * IO               write image           IO|ACK            power off
+ *                                        ABORT             POST_RESUME
+ *
+ * (Boot time)      check for image       IMAGE|ACK         RESUME_PREP
+ *                                        (Note 1)
+ *                                        IMAGE|NACK        (Note 2)
+ *
+ * PREP             prepare read image    PREP|ACK          IMAGE_READ
+ *                                        PREP|NACK         (As NACK_IMAGE)
+ *
+ * IO               read image            IO|ACK            POST_RESUME
+ *
+ * POST_RESUME      thaw, post-script                       RUNNING
+ *
+ * INIT_0           init 0
+ *
+ * Other messages:
+ *
+ * - PING: Request for all other live nodes to send a PONG. Used at startup to
+ *   announce presence, when a node is suspected dead and periodically, in case
+ *   segments of the network are [un]plugged.
+ *
+ * - PONG: Response to a PING.
+ *
+ * - ABORT: Request to cancel writing an image.
+ *
+ * - BYE: Notification that this node is shutting down.
+ *
+ * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
+ * nodes which are slower to start up can get state synchronised. If a node
+ * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
+ * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
+ * must invalidate its image (if any) and boot normally.
+ *
+ * Note 2: May occur when one node lost power or powered off while others
+ * hibernated. This node waits for others to complete resuming (ACK_READ)
+ * before completing its boot, so that it appears as a failed node restarting.
+ *
+ * If any node has an image, then it also has a list of nodes that hibernated
+ * in synchronisation with it. The node will wait for other nodes to appear
+ * or timeout before beginning its restoration.
+ *
+ * If a node has no image, it needs to wait, in case other nodes which do have
+ * an image are going to resume, but are taking longer to announce their
+ * presence. For this reason, the user can specify a timeout value and a number
+ * of nodes detected before we just continue. (We might want to assume in a
+ * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
+ * the remaining nodes will too. This might help in situations where some nodes
+ * are much slower to boot, or more subject to hardware failures or such like).
+ */
+
+#include <linux/suspend.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+
+#if 1
+#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
+#else
+#define PRINTK(a, b...) do { } while (0)
+#endif
+
+static int loopback_mode;
+static int num_local_nodes = 1;
+#define MAX_LOCAL_NODES 8
+#define SADDR (loopback_mode ? b->sid : h->saddr)
+
+#define MYNAME "TuxOnIce Clustering"
+
+enum cluster_message {
+        MSG_ACK = 1,
+        MSG_NACK = 2,
+        MSG_PING = 4,
+        MSG_ABORT = 8,
+        MSG_BYE = 16,
+        MSG_HIBERNATE = 32,
+        MSG_IMAGE = 64,
+        MSG_IO = 128,
+        MSG_RUNNING = 256
+};
+
+static char *str_message(int message)
+{
+        switch (message) {
+        case 4:
+                return "Ping";
+        case 8:
+                return "Abort";
+        case 9:
+                return "Abort acked";
+        case 10:
+                return "Abort nacked";
+        case 16:
+                return "Bye";
+        case 17:
+                return "Bye acked";
+        case 18:
+                return "Bye nacked";
+        case 32:
+                return "Hibernate request";
+        case 33:
+                return "Hibernate ack";
+        case 34:
+                return "Hibernate nack";
+        case 64:
+                return "Image exists?";
+        case 65:
+                return "Image does exist";
+        case 66:
+                return "No image here";
+        case 128:
+                return "I/O";
+        case 129:
+                return "I/O okay";
+        case 130:
+                return "I/O failed";
+        case 256:
+                return "Running";
+        default:
+                printk(KERN_ERR "Unrecognised message %d.\n", message);
+                return "Unrecognised message (see dmesg)";
+        }
+}
+
+#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
+#define MSG_STATE_MASK (~MSG_ACK_MASK)
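+
+/*
+ * Example of the encoding handled by str_message() above: a "Hibernate ack"
+ * is MSG_HIBERNATE | MSG_ACK = 32 | 1 = 33, and masking with MSG_STATE_MASK
+ * recovers the bare MSG_HIBERNATE request.
+ */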
+
+struct node_info {
+        struct list_head member_list;
+        wait_queue_head_t member_events;
+        spinlock_t member_list_lock;
+        spinlock_t receive_lock;
+        int peer_count, ignored_peer_count;
+        struct toi_sysfs_data sysfs_data;
+        enum cluster_message current_message;
+};
+
+struct node_info node_array[MAX_LOCAL_NODES];
+
+struct cluster_member {
+        __be32 addr;
+        enum cluster_message message;
+        struct list_head list;
+        int ignore;
+};
+
+#define toi_cluster_port_send 3501
+#define toi_cluster_port_recv 3502
+
+static struct net_device *net_dev;
+static struct toi_module_ops toi_cluster_ops;
+
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+                struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type toi_cluster_packet_type = {
+        .type = __constant_htons(ETH_P_IP),
+        .func = toi_recv,
+};
+
+struct toi_pkt {                /* BOOTP packet format */
+        struct iphdr iph;       /* IP header */
+        struct udphdr udph;     /* UDP header */
+        u8 htype;               /* HW address type */
+        u8 hlen;                /* HW address length */
+        __be32 xid;             /* Transaction ID */
+        __be16 secs;            /* Seconds since we started */
+        __be16 flags;           /* Just what it says */
+        u8 hw_addr[16];         /* Sender's HW address */
+        u16 message;            /* Message */
+        unsigned long sid;      /* Source ID for loopback testing */
+};
+
+static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
+
+static int added_pack;
+
+static int others_have_image;
+
+/* Key used to allow multiple clusters on the same lan */
+static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
+static char pre_hibernate_script[255] =
+        CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
+static char post_hibernate_script[255] =
+        CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
+
+/* List of cluster members */
+static unsigned long continue_delay = 5 * HZ;
+static unsigned long cluster_message_timeout = 3 * HZ;
+
+/* === Membership list === */
+
+static void print_member_info(int index)
+{
+        struct cluster_member *this;
+
+        printk(KERN_INFO "==> Dumping node %d.\n", index);
+
+        list_for_each_entry(this, &node_array[index].member_list, list)
+                printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
+                                NIPQUAD(this->addr),
+                                str_message(this->message),
+                                this->ignore ? "(Ignored)" : "");
+        printk(KERN_INFO "== Done ==\n");
+}
+
+static struct cluster_member *__find_member(int index, __be32 addr)
+{
+        struct cluster_member *this;
+
+        list_for_each_entry(this, &node_array[index].member_list, list) {
+                if (this->addr != addr)
+                        continue;
+
+                return this;
+        }
+
+        return NULL;
+}
+
+static void set_ignore(int index, __be32 addr, struct cluster_member *this)
+{
+        if (this->ignore) {
+                PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
+                                index, NIPQUAD(addr));
+                return;
+        }
+
+        PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
+                                index, NIPQUAD(addr));
+        this->ignore = 1;
+        node_array[index].ignored_peer_count++;
+}
+
+static int __add_update_member(int index, __be32 addr, int message)
+{
+        struct cluster_member *this;
+
+        this = __find_member(index, addr);
+        if (this) {
+                if (this->message != message) {
+                        this->message = message;
+                        if ((message & MSG_NACK) &&
+                            (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
+                                set_ignore(index, addr, this);
+                        PRINTK("Node %d sees node %d.%d.%d.%d now sending "
+                                        "%s.\n", index, NIPQUAD(addr),
+                                        str_message(message));
+                        wake_up(&node_array[index].member_events);
+                }
+                return 0;
+        }
+
+        this = (struct cluster_member *) toi_kzalloc(36,
+                        sizeof(struct cluster_member), GFP_KERNEL);
+
+        if (!this)
+                return -1;
+
+        this->addr = addr;
+        this->message = message;
+        this->ignore = 0;
+        INIT_LIST_HEAD(&this->list);
+
+        node_array[index].peer_count++;
+
+        PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
+                        NIPQUAD(addr), str_message(message));
+
+        if ((message & MSG_NACK) &&
+            (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
+                set_ignore(index, addr, this);
+        list_add_tail(&this->list, &node_array[index].member_list);
+        return 1;
+}
+
+static int add_update_member(int index, __be32 addr, int message)
+{
+        int result;
+        unsigned long flags;
+        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+        result = __add_update_member(index, addr, message);
+        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+
+        print_member_info(index);
+
+        wake_up(&node_array[index].member_events);
+
+        return result;
+}
+
+static void del_member(int index, __be32 addr)
+{
+        struct cluster_member *this;
+        unsigned long flags;
+
+        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+        this = __find_member(index, addr);
+
+        if (this) {
+                list_del_init(&this->list);
+                toi_kfree(36, this, sizeof(*this));
+                node_array[index].peer_count--;
+        }
+
+        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+}
+
+/*                 === Message transmission ===        */
+
+static void toi_send_if(int message, unsigned long my_id);
+
+/*
+ *  Process received TOI packet.
+ */
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+                struct packet_type *pt, struct net_device *orig_dev)
+{
+        struct toi_pkt *b;
+        struct iphdr *h;
+        int len, result, index;
+        unsigned long addr, message, ack;
+
+        /* Perform verifications before taking the lock.  */
+        if (skb->pkt_type == PACKET_OTHERHOST)
+                goto drop;
+
+        if (dev != net_dev)
+                goto drop;
+
+        skb = skb_share_check(skb, GFP_ATOMIC);
+        if (!skb)
+                return NET_RX_DROP;
+
+        if (!pskb_may_pull(skb,
+                           sizeof(struct iphdr) +
+                           sizeof(struct udphdr)))
+                goto drop;
+
+        b = (struct toi_pkt *)skb_network_header(skb);
+        h = &b->iph;
+
+        if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+                goto drop;
+
+        /* Fragments are not supported */
+        if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+                if (net_ratelimit())
+                        printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
+                               "cluster message.\n");
+                goto drop;
+        }
+
+        if (skb->len < ntohs(h->tot_len))
+                goto drop;
+
+        if (ip_fast_csum((char *) h, h->ihl))
+                goto drop;
+
+        if (b->udph.source != htons(toi_cluster_port_send) ||
+            b->udph.dest != htons(toi_cluster_port_recv))
+                goto drop;
+
+        if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+                goto drop;
+
+        len = ntohs(b->udph.len) - sizeof(struct udphdr);
+
+        /* Ok the front looks good, make sure we can get at the rest.  */
+        if (!pskb_may_pull(skb, skb->len))
+                goto drop;
+
+        b = (struct toi_pkt *)skb_network_header(skb);
+        h = &b->iph;
+
+        addr = SADDR;
+        PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
+                        str_message(b->message), NIPQUAD(addr));
+
+        message = b->message & MSG_STATE_MASK;
+        ack = b->message & MSG_ACK_MASK;
+
+        for (index = 0; index < num_local_nodes; index++) {
+                int new_message = node_array[index].current_message,
+                    old_message = new_message;
+
+                if (index == SADDR || !old_message) {
+                        PRINTK("Ignoring node %d (offline or self).\n", index);
+                        continue;
+                }
+
+                /* One message at a time, please. */
+                spin_lock(&node_array[index].receive_lock);
+
+                result = add_update_member(index, SADDR, b->message);
+                if (result == -1) {
+                        printk(KERN_INFO "Failed to add new cluster member "
+                                        NIPQUAD_FMT ".\n",
+                                        NIPQUAD(addr));
+                        goto drop_unlock;
+                }
+
+                switch (b->message & MSG_STATE_MASK) {
+                case MSG_PING:
+                        break;
+                case MSG_ABORT:
+                        break;
+                case MSG_BYE:
+                        break;
+                case MSG_HIBERNATE:
+                        /* Can I hibernate? */
+                        new_message = MSG_HIBERNATE |
+                                ((index & 1) ? MSG_NACK : MSG_ACK);
+                        break;
+                case MSG_IMAGE:
+                        /* Can I resume? */
+                        new_message = MSG_IMAGE |
+                                ((index & 1) ? MSG_NACK : MSG_ACK);
+                        if (new_message != old_message)
+                                printk(KERN_ERR "Setting whether I can resume "
+                                                "to %d.\n", new_message);
+                        break;
+                case MSG_IO:
+                        new_message = MSG_IO | MSG_ACK;
+                        break;
+                case MSG_RUNNING:
+                        break;
+                default:
+                        if (net_ratelimit())
+                                printk(KERN_ERR "Unrecognised TuxOnIce cluster"
+                                        " message %d from " NIPQUAD_FMT ".\n",
+                                        b->message, NIPQUAD(addr));
+                }
+
+                if (old_message != new_message) {
+                        node_array[index].current_message = new_message;
+                        printk(KERN_INFO ">>> Sending new message for node "
+                                        "%d.\n", index);
+                        toi_send_if(new_message, index);
+                } else if (!ack) {
+                        printk(KERN_INFO ">>> Resending message for node %d.\n",
+                                        index);
+                        toi_send_if(new_message, index);
+                }
+drop_unlock:
+                spin_unlock(&node_array[index].receive_lock);
+        }
+
+drop:
+        /* Throw the packet out. */
+        kfree_skb(skb);
+
+        return 0;
+}
+
+/*
+ *  Send cluster message to single interface.
+ */
+static void toi_send_if(int message, unsigned long my_id)
+{
+        struct sk_buff *skb;
+        struct toi_pkt *b;
+        int hh_len = LL_RESERVED_SPACE(net_dev);
+        struct iphdr *h;
+
+        /* Allocate packet */
+        skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
+        if (!skb)
+                return;
+        skb_reserve(skb, hh_len);
+        b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
+        memset(b, 0, sizeof(struct toi_pkt));
+
+        /* Construct IP header */
+        skb_reset_network_header(skb);
+        h = ip_hdr(skb);
+        h->version = 4;
+        h->ihl = 5;
+        h->tot_len = htons(sizeof(struct toi_pkt));
+        h->frag_off = htons(IP_DF);
+        h->ttl = 64;
+        h->protocol = IPPROTO_UDP;
+        h->daddr = htonl(INADDR_BROADCAST);
+        h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+        /* Construct UDP header */
+        b->udph.source = htons(toi_cluster_port_send);
+        b->udph.dest = htons(toi_cluster_port_recv);
+        b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
+        /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+        /* Construct message */
+        b->message = message;
+        b->sid = my_id;
+        b->htype = net_dev->type; /* can cause undefined behavior */
+        b->hlen = net_dev->addr_len;
+        memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
+        b->secs = htons(3); /* 3 seconds */
+
+        /* Chain packet down the line... */
+        skb->dev = net_dev;
+        skb->protocol = htons(ETH_P_IP);
+        if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
+                     net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
+                        dev_queue_xmit(skb) < 0)
+                printk(KERN_INFO "E");
+}
+
+/*        =========================================                */
+
+/*                        kTOICluster                        */
+
+static atomic_t num_cluster_threads;
+static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
+
+static int kTOICluster(void *data)
+{
+        unsigned long my_id;
+
+        my_id = atomic_add_return(1, &num_cluster_threads) - 1;
+        node_array[my_id].current_message = (unsigned long) data;
+
+        PRINTK("kTOICluster daemon %lu starting.\n", my_id);
+
+        current->flags |= PF_NOFREEZE;
+
+        while (node_array[my_id].current_message) {
+                toi_send_if(node_array[my_id].current_message, my_id);
+                sleep_on_timeout(&clusterd_events,
+                                cluster_message_timeout);
+                PRINTK("Link state %lu is %d.\n", my_id,
+                                node_array[my_id].current_message);
+        }
+
+        toi_send_if(MSG_BYE, my_id);
+        atomic_dec(&num_cluster_threads);
+        wake_up(&clusterd_events);
+
+        PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
+        __set_current_state(TASK_RUNNING);
+        return 0;
+}
+
+static void kill_clusterd(void)
+{
+        int i;
+
+        for (i = 0; i < num_local_nodes; i++) {
+                if (node_array[i].current_message) {
+                        PRINTK("Seeking to kill clusterd %d.\n", i);
+                        node_array[i].current_message = 0;
+                }
+        }
+        wait_event(clusterd_events,
+                        !atomic_read(&num_cluster_threads));
+        PRINTK("All cluster daemons have exited.\n");
+}
+
+static int peers_not_in_message(int index, int message, int precise)
+{
+        struct cluster_member *this;
+        unsigned long flags;
+        int result = 0;
+
+        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+        list_for_each_entry(this, &node_array[index].member_list, list) {
+                if (this->ignore)
+                        continue;
+
+                PRINTK("Peer %d.%d.%d.%d sending %s. "
+                        "Seeking %s.\n",
+                        NIPQUAD(this->addr),
+                        str_message(this->message), str_message(message));
+                if ((precise ? this->message :
+                                        this->message & MSG_STATE_MASK) !=
+                                        message)
+                        result++;
+        }
+        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+        PRINTK("%d peers not in sought message.\n", result);
+        return result;
+}
+
+static void reset_ignored(int index)
+{
+        struct cluster_member *this;
+        unsigned long flags;
+
+        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+        list_for_each_entry(this, &node_array[index].member_list, list)
+                this->ignore = 0;
+        node_array[index].ignored_peer_count = 0;
+        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+}
+
+static int peers_in_message(int index, int message, int precise)
+{
+        return node_array[index].peer_count -
+                node_array[index].ignored_peer_count -
+                peers_not_in_message(index, message, precise);
+}
+
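+/*
+ * time_to_continue
+ *
+ * Used as a wait_event() condition: returns 1 once every peer has entered
+ * the 'message' state and none is still sitting on the bare, un-acked
+ * form of it, or once continue_delay jiffies have elapsed since 'start';
+ * otherwise 0.
+ */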
+static int time_to_continue(int index, unsigned long start, int message)
+{
+        int first = peers_not_in_message(index, message, 0);
+        int second = peers_in_message(index, message, 1);
+
+        PRINTK("First part returns %d, second returns %d.\n", first, second);
+
+        if (!first && !second) {
+                PRINTK("All peers answered message %d.\n",
+                        message);
+                return 1;
+        }
+
+        if (time_after(jiffies, start + continue_delay)) {
+                PRINTK("Timeout reached.\n");
+                return 1;
+        }
+
+        PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
+                        start + continue_delay);
+        return 0;
+}
+
+void toi_initiate_cluster_hibernate(void)
+{
+        int result;
+        unsigned long start;
+
+        result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
+        if (result)
+                return;
+
+        toi_send_if(MSG_HIBERNATE, 0);
+
+        start = jiffies;
+        wait_event(node_array[0].member_events,
+                        time_to_continue(0, start, MSG_HIBERNATE));
+
+        if (test_action_state(TOI_FREEZER_TEST)) {
+                toi_send_if(MSG_ABORT, 0);
+
+                start = jiffies;
+                wait_event(node_array[0].member_events,
+                        time_to_continue(0, start, MSG_RUNNING));
+
+                do_toi_step(STEP_QUIET_CLEANUP);
+                return;
+        }
+
+        toi_send_if(MSG_IO, 0);
+
+        result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
+        if (result)
+                return;
+
+        /* This code runs at resume time too! */
+        if (toi_in_hibernate)
+                result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
+}
+
+/* toi_cluster_print_debug_stats
+ *
+ * Description:        Print information to be recorded for debugging purposes into a
+ *                 buffer.
+ * Arguments:        buffer: Pointer to a buffer into which the debug info will be
+ *                         printed.
+ *                 size:        Size of the buffer.
+ * Returns:        Number of characters written to the buffer.
+ */
+static int toi_cluster_print_debug_stats(char *buffer, int size)
+{
+        int len;
+
+        if (strlen(toi_cluster_iface))
+                len = scnprintf(buffer, size,
+                                "- Cluster interface is '%s'.\n",
+                                toi_cluster_iface);
+        else
+                len = scnprintf(buffer, size,
+                                "- Cluster support is disabled.\n");
+        return len;
+}
+
+/* cluster_memory_needed
+ *
+ * Description:        Tell the caller how much memory we need to operate during
+ *                 hibernate/resume.
+ * Returns:        Unsigned long. Maximum number of bytes of memory required for
+ *                 operation.
+ */
+static int toi_cluster_memory_needed(void)
+{
+        return 0;
+}
+
+static int toi_cluster_storage_needed(void)
+{
+        return 1 + strlen(toi_cluster_iface);
+}
+
+/* toi_cluster_save_config_info
+ *
+ * Description:        Save information needed when reloading the image at resume time.
+ * Arguments:        Buffer:                Pointer to a buffer of size PAGE_SIZE.
+ * Returns:        Number of bytes used for saving our data.
+ */
+static int toi_cluster_save_config_info(char *buffer)
+{
+        strcpy(buffer, toi_cluster_iface);
+        return strlen(toi_cluster_iface) + 1;
+}
+
+/* toi_cluster_load_config_info
+ *
+ * Description:        Reload information needed for declustering the image at
+ *                 resume time.
+ * Arguments:        Buffer:                Pointer to the start of the data.
+ *                Size:                Number of bytes that were saved.
+ */
+static void toi_cluster_load_config_info(char *buffer, int size)
+{
+        strncpy(toi_cluster_iface, buffer, size);
+        return;
+}
+
+static void cluster_startup(void)
+{
+        int have_image = do_check_can_resume(), i;
+        unsigned long start = jiffies, initial_message;
+        struct task_struct *p;
+
+        initial_message = MSG_IMAGE;
+
+        have_image = 1;
+
+        for (i = 0; i < num_local_nodes; i++) {
+                PRINTK("Starting ktoiclusterd %d.\n", i);
+                p = kthread_create(kTOICluster, (void *) initial_message,
+                                "ktoiclusterd/%d", i);
+                if (IS_ERR(p)) {
+                        printk(KERN_ERR "Failed to start ktoiclusterd.\n");
+                        return;
+                }
+
+                wake_up_process(p);
+        }
+
+        /* Wait for delay or someone else sending first message */
+        wait_event(node_array[0].member_events, time_to_continue(0, start,
+                                MSG_IMAGE));
+
+        others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
+
+        printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
+                " %d.\n", have_image ? "" : "don't ", others_have_image);
+
+        if (have_image) {
+                int result;
+
+                /* Start to resume */
+                printk(KERN_INFO "  === Starting to resume ===  \n");
+                node_array[0].current_message = MSG_IO;
+                toi_send_if(MSG_IO, 0);
+
+                /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
+                result = 0;
+
+                if (!result) {
+                        /*
+                         * Atomic restore - we'll come back in the hibernation
+                         * path.
+                         */
+
+                        /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
+                        result = 0;
+
+                        /* do_toi_step(STEP_QUIET_CLEANUP); */
+                }
+
+                node_array[0].current_message |= MSG_NACK;
+
+                /* For debugging - disable for real life? */
+                wait_event(node_array[0].member_events,
+                                time_to_continue(0, start, MSG_IO));
+        }
+
+        if (others_have_image) {
+                /* Wait for them to resume */
+                printk(KERN_INFO "Waiting for other nodes to resume.\n");
+                start = jiffies;
+                wait_event(node_array[0].member_events,
+                                time_to_continue(0, start, MSG_RUNNING));
+                if (peers_not_in_message(0, MSG_RUNNING, 0))
+                        printk(KERN_INFO "Timed out while waiting for other "
+                                        "nodes to resume.\n");
+        }
+
+        /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
+         * as appropriate.
+         *
+         * If we don't have an image:
+         * - Wait until someone else says they have one, or conditions are met
+         *   for continuing to boot (n machines or t seconds).
+         * - If anyone has an image, wait for them to resume before continuing
+         *   to boot.
+         *
+         * If we have an image:
+         * - Wait until conditions are met before continuing to resume (n
+         *   machines or t seconds). Send RESUME_PREP and freeze processes.
+         *   NACK_PREP if freezing fails (shouldn't) and follow logic for
+         *   us having no image above. On success, wait for [N]ACK_PREP from
+         *   other machines. Read image (including atomic restore) until done.
+         *   Wait for ACK_READ from others (should never fail). Thaw processes
+         *   and do post-resume. (The section after the atomic restore is done
+         *   via the code for hibernating).
+         */
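+
+        /*
+         * Illustrative two-node flow using the MSG_* constants in this
+         * file: both nodes broadcast MSG_IMAGE, peers answer with
+         * MSG_IMAGE | MSG_ACK or MSG_NACK, a node holding an image then
+         * drives MSG_IO and resumes, and nodes without one wait for it to
+         * reach MSG_RUNNING before continuing to boot.
+         */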
+
+        node_array[0].current_message = MSG_RUNNING;
+}
+
+/* toi_cluster_open_iface
+ *
+ * Description:        Prepare to use an interface.
+ */
+
+static int toi_cluster_open_iface(void)
+{
+        struct net_device *dev;
+
+        rtnl_lock();
+
+        for_each_netdev(&init_net, dev) {
+                if (/* dev == &init_net.loopback_dev || */
+                    strcmp(dev->name, toi_cluster_iface))
+                        continue;
+
+                net_dev = dev;
+                break;
+        }
+
+        rtnl_unlock();
+
+        if (!net_dev) {
+                printk(KERN_ERR MYNAME ": Device %s not found.\n",
+                                toi_cluster_iface);
+                return -ENODEV;
+        }
+
+        dev_add_pack(&toi_cluster_packet_type);
+        added_pack = 1;
+
+        loopback_mode = (net_dev == init_net.loopback_dev);
+        num_local_nodes = loopback_mode ? 8 : 1;
+
+        PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
+                        loopback_mode ? "on" : "off", num_local_nodes);
+
+        cluster_startup();
+        return 0;
+}
+
+/* toi_cluster_close_iface
+ *
+ * Description: Stop using an interface.
+ */
+
+static int toi_cluster_close_iface(void)
+{
+        kill_clusterd();
+        if (added_pack) {
+                dev_remove_pack(&toi_cluster_packet_type);
+                added_pack = 0;
+        }
+        return 0;
+}
+
+static void write_side_effect(void)
+{
+        if (toi_cluster_ops.enabled) {
+                toi_cluster_open_iface();
+                set_toi_state(TOI_CLUSTER_MODE);
+        } else {
+                toi_cluster_close_iface();
+                clear_toi_state(TOI_CLUSTER_MODE);
+        }
+}
+
+static void node_write_side_effect(void)
+{
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
+                        NULL),
+        SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
+                        write_side_effect),
+        SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
+        SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
+                        256, 0, NULL),
+        SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
+                        256, 0, STRING),
+        SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
+                        0)
+};
+
+/*
+ * Ops structure.
+ */
+
+static struct toi_module_ops toi_cluster_ops = {
+        .type                        = FILTER_MODULE,
+        .name                        = "Cluster",
+        .directory                = "cluster",
+        .module                        = THIS_MODULE,
+        .memory_needed                 = toi_cluster_memory_needed,
+        .print_debug_info        = toi_cluster_print_debug_stats,
+        .save_config_info        = toi_cluster_save_config_info,
+        .load_config_info        = toi_cluster_load_config_info,
+        .storage_needed                = toi_cluster_storage_needed,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+#ifdef MODULE
+#define INIT static __init
+#define EXIT static __exit
+#else
+#define INIT
+#define EXIT
+#endif
+
+INIT int toi_cluster_init(void)
+{
+        int temp = toi_register_module(&toi_cluster_ops), i;
+        struct kobject *kobj = toi_cluster_ops.dir_kobj;
+
+        for (i = 0; i < MAX_LOCAL_NODES; i++) {
+                node_array[i].current_message = 0;
+                INIT_LIST_HEAD(&node_array[i].member_list);
+                init_waitqueue_head(&node_array[i].member_events);
+                spin_lock_init(&node_array[i].member_list_lock);
+                spin_lock_init(&node_array[i].receive_lock);
+
+                /* Set up sysfs entry */
+                node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
+                                sizeof(node_array[i].sysfs_data.attr.name),
+                                GFP_KERNEL);
+                sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
+                                i);
+                node_array[i].sysfs_data.attr.mode = SYSFS_RW;
+                node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
+                node_array[i].sysfs_data.flags = 0;
+                node_array[i].sysfs_data.data.integer.variable =
+                        (int *) &node_array[i].current_message;
+                node_array[i].sysfs_data.data.integer.minimum = 0;
+                node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
+                node_array[i].sysfs_data.write_side_effect =
+                        node_write_side_effect;
+                toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
+        }
+
+        toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
+
+        if (toi_cluster_ops.enabled)
+                toi_cluster_open_iface();
+
+        return temp;
+}
+
+EXIT void toi_cluster_exit(void)
+{
+        int i;
+        toi_cluster_close_iface();
+
+        for (i = 0; i < MAX_LOCAL_NODES; i++)
+                toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
+                                &node_array[i].sysfs_data);
+        toi_unregister_module(&toi_cluster_ops);
+}
+
+static int __init toi_cluster_iface_setup(char *iface)
+{
+        toi_cluster_ops.enabled = (*iface &&
+                        strcmp(iface, "off"));
+
+        if (toi_cluster_ops.enabled)
+                strncpy(toi_cluster_iface, iface, strlen(iface));
+
+        return 1;
+}
+
+__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git b/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
new file mode 100644
index 0000000..84356b3
--- /dev/null
+++ b/kernel/power/tuxonice_cluster.h
@@ -0,0 +1,18 @@
+/*
+ * kernel/power/tuxonice_cluster.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifdef CONFIG_TOI_CLUSTER
+extern int toi_cluster_init(void);
+extern void toi_cluster_exit(void);
+extern void toi_initiate_cluster_hibernate(void);
+#else
+static inline int toi_cluster_init(void) { return 0; }
+static inline void toi_cluster_exit(void) { }
+static inline void toi_initiate_cluster_hibernate(void) { }
+#endif
+
diff --git b/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
new file mode 100644
index 0000000..84b8522
--- /dev/null
+++ b/kernel/power/tuxonice_compress.c
@@ -0,0 +1,452 @@
+/*
+ * kernel/power/tuxonice_compress.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data compression routines for TuxOnIce,
+ * using cryptoapi.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+
+#include "tuxonice_builtin.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+
+static int toi_expected_compression;
+
+static struct toi_module_ops toi_compression_ops;
+static struct toi_module_ops *next_driver;
+
+static char toi_compressor_name[32] = "lzo";
+
+static DEFINE_MUTEX(stats_lock);
+
+struct cpu_context {
+        u8 *page_buffer;
+        struct crypto_comp *transform;
+        unsigned int len;
+        u8 *buffer_start;
+        u8 *output_buffer;
+};
+
+#define OUT_BUF_SIZE (2 * PAGE_SIZE)
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+
+/*
+ * toi_crypto_prepare
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ */
+static int toi_compress_crypto_prepare(void)
+{
+        int cpu;
+
+        if (!*toi_compressor_name) {
+                printk(KERN_INFO "TuxOnIce: Compression enabled but no "
+                                "compressor name set.\n");
+                return 1;
+        }
+
+        for_each_online_cpu(cpu) {
+                struct cpu_context *this = &per_cpu(contexts, cpu);
+                this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
+                if (IS_ERR(this->transform)) {
+                        printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+                                        "%s compression transform.\n",
+                                        toi_compressor_name);
+                        this->transform = NULL;
+                        return 1;
+                }
+
+                this->page_buffer =
+                        (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
+
+                if (!this->page_buffer) {
+                        printk(KERN_ERR
+                          "Failed to allocate a page buffer for TuxOnIce "
+                          "compression driver.\n");
+                        return -ENOMEM;
+                }
+
+                this->output_buffer =
+                        (char *) vmalloc_32(OUT_BUF_SIZE);
+
+                if (!this->output_buffer) {
+                        printk(KERN_ERR
+                          "Failed to allocate an output buffer for TuxOnIce "
+                          "compression driver.\n");
+                        return -ENOMEM;
+                }
+        }
+
+        return 0;
+}
+
+static int toi_compress_rw_cleanup(int writing)
+{
+        int cpu;
+
+        for_each_online_cpu(cpu) {
+                struct cpu_context *this = &per_cpu(contexts, cpu);
+                if (this->transform) {
+                        crypto_free_comp(this->transform);
+                        this->transform = NULL;
+                }
+
+                if (this->page_buffer)
+                        toi_free_page(16, (unsigned long) this->page_buffer);
+
+                this->page_buffer = NULL;
+
+                if (this->output_buffer)
+                        vfree(this->output_buffer);
+
+                this->output_buffer = NULL;
+        }
+
+        return 0;
+}
+
+/*
+ * toi_compress_init
+ */
+
+static int toi_compress_init(int toi_or_resume)
+{
+        if (!toi_or_resume)
+                return 0;
+
+        toi_compress_bytes_in = 0;
+        toi_compress_bytes_out = 0;
+
+        next_driver = toi_get_next_filter(&toi_compression_ops);
+
+        return next_driver ? 0 : -ECHILD;
+}
+
+/*
+ * toi_compress_rw_init()
+ */
+
+static int toi_compress_rw_init(int rw, int stream_number)
+{
+        if (toi_compress_crypto_prepare()) {
+                printk(KERN_ERR "Failed to initialise compression "
+                                "algorithm.\n");
+                if (rw == READ) {
+                        printk(KERN_INFO "Unable to read the image.\n");
+                        return -ENODEV;
+                } else {
+                        printk(KERN_INFO "Continuing without "
+                                "compressing the image.\n");
+                        toi_compression_ops.enabled = 0;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * toi_compress_write_page()
+ *
+ * Compress a page of data, buffering output and passing on filled
+ * pages to the next module in the pipeline.
+ *
+ * Buffer_page:        Pointer to a buffer of size PAGE_SIZE, containing
+ * data to be compressed.
+ *
+ * Returns:        0 on success. Otherwise the error is that returned by later
+ *                 modules, -ECHILD if we have a broken pipeline or -EIO if
+ *                 the compressor errs.
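+ *
+ *                 If the transform is absent or fails to shrink the page,
+ *                 the original, uncompressed page is passed through to the
+ *                 next module unchanged.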
+ */
+static int toi_compress_write_page(unsigned long index, int buf_type,
+                void *buffer_page, unsigned int buf_size)
+{
+        int ret = 0, cpu = smp_processor_id();
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+        u8 *output_buffer = buffer_page;
+        int output_len = buf_size;
+        int out_buf_type = buf_type;
+
+        if (ctx->transform) {
+
+                ctx->buffer_start = TOI_MAP(buf_type, buffer_page);
+                ctx->len = OUT_BUF_SIZE;
+
+                ret = crypto_comp_compress(ctx->transform,
+                        ctx->buffer_start, buf_size,
+                        ctx->output_buffer, &ctx->len);
+
+                TOI_UNMAP(buf_type, buffer_page);
+
+                toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
+                                "CPU %d, index %lu: %d bytes",
+                                cpu, index, ctx->len);
+
+                if (!ret && ctx->len < buf_size) { /* some compression */
+                        output_buffer = ctx->output_buffer;
+                        output_len = ctx->len;
+                        out_buf_type = TOI_VIRT;
+                }
+
+        }
+
+        mutex_lock(&stats_lock);
+
+        toi_compress_bytes_in += buf_size;
+        toi_compress_bytes_out += output_len;
+
+        mutex_unlock(&stats_lock);
+
+        if (!ret)
+                ret = next_driver->write_page(index, out_buf_type,
+                                output_buffer, output_len);
+
+        return ret;
+}
+
+/*
+ * toi_compress_read_page()
+ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Retrieve data from later modules and decompress it until the input buffer
+ * is filled.
+ * Zero if successful. Error condition from me or from downstream on failure.
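+ *
+ * A block that comes back from the next module at exactly PAGE_SIZE is
+ * assumed to have been stored uncompressed and is copied through without
+ * decompression.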
+ */
+static int toi_compress_read_page(unsigned long *index, int buf_type,
+                void *buffer_page, unsigned int *buf_size)
+{
+        int ret, cpu = smp_processor_id();
+        unsigned int len;
+        unsigned int outlen = PAGE_SIZE;
+        char *buffer_start;
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+        if (!ctx->transform)
+                return next_driver->read_page(index, TOI_PAGE, buffer_page,
+                                buf_size);
+
+        /*
+         * All our reads must be synchronous - we can't decompress
+         * data that hasn't been read yet.
+         */
+
+        ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len);
+
+        buffer_start = TOI_MAP(buf_type, buffer_page);
+
+        /* Error or uncompressed data */
+        if (ret || len == PAGE_SIZE) {
+                memcpy(buffer_start, ctx->page_buffer, len);
+                goto out;
+        }
+
+        ret = crypto_comp_decompress(
+                        ctx->transform,
+                        ctx->page_buffer,
+                        len, buffer_start, &outlen);
+
+        toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
+                        "CPU %d, index %lu: %d=>%d (%d).",
+                        cpu, *index, len, outlen, ret);
+
+        if (ret)
+                abort_hibernate(TOI_FAILED_IO,
+                        "Compress_read returned %d.\n", ret);
+        else if (outlen != PAGE_SIZE) {
+                abort_hibernate(TOI_FAILED_IO,
+                        "Decompression yielded %d bytes instead of %ld.\n",
+                        outlen, PAGE_SIZE);
+                printk(KERN_ERR "Decompression yielded %d bytes instead of "
+                                "%ld.\n", outlen, PAGE_SIZE);
+                ret = -EIO;
+                *buf_size = outlen;
+        }
+out:
+        TOI_UNMAP(buf_type, buffer_page);
+        return ret;
+}
+
+/*
+ * toi_compress_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_compress_print_debug_stats(char *buffer, int size)
+{
+        unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
+                      pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
+        int len;
+
+        /* Output the compression ratio achieved. */
+        if (*toi_compressor_name)
+                len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
+                                toi_compressor_name);
+        else
+                len = scnprintf(buffer, size, "- Compressor is not set.\n");
+
+        if (pages_in)
+                len += scnprintf(buffer+len, size - len, "  Compressed "
+                        "%lu bytes into %lu (%ld percent compression).\n",
+                  toi_compress_bytes_in,
+                  toi_compress_bytes_out,
+                  (pages_in - pages_out) * 100 / pages_in);
+        return len;
+}
+
+/*
+ * toi_compress_compression_memory_needed
+ *
+ * Tell the caller how much memory we need to operate during hibernate/resume.
+ * Returns: Unsigned long. Maximum number of bytes of memory required for
+ * operation.
+ */
+static int toi_compress_memory_needed(void)
+{
+        return 2 * PAGE_SIZE;
+}
+
+static int toi_compress_storage_needed(void)
+{
+        return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
+                strlen(toi_compressor_name) + 1;
+}
+
+/*
+ * toi_compress_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save information needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
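+ *
+ * Layout written (mirrored by toi_compress_load_config_info below):
+ *   [unsigned long] toi_compress_bytes_in
+ *   [unsigned long] toi_compress_bytes_out
+ *   [int]           toi_expected_compression
+ *   [int]           length of the compressor name (including its NUL)
+ *   [char[]]        toi_compressor_name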
+ */
+static int toi_compress_save_config_info(char *buffer)
+{
+        int len = strlen(toi_compressor_name) + 1, offset = 0;
+
+        *((unsigned long *) buffer) = toi_compress_bytes_in;
+        offset += sizeof(unsigned long);
+        *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out;
+        offset += sizeof(unsigned long);
+        *((int *) (buffer + offset)) = toi_expected_compression;
+        offset += sizeof(int);
+        *((int *) (buffer + offset)) = len;
+        offset += sizeof(int);
+        strncpy(buffer + offset, toi_compressor_name, len);
+        return offset + len;
+}
+
+/* toi_compress_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description:        Reload information needed for decompressing the image at
+ * resume time.
+ */
+static void toi_compress_load_config_info(char *buffer, int size)
+{
+        int len, offset = 0;
+
+        toi_compress_bytes_in = *((unsigned long *) buffer);
+        offset += sizeof(unsigned long);
+        toi_compress_bytes_out = *((unsigned long *) (buffer + offset));
+        offset += sizeof(unsigned long);
+        toi_expected_compression = *((int *) (buffer + offset));
+        offset += sizeof(int);
+        len = *((int *) (buffer + offset));
+        offset += sizeof(int);
+        strncpy(toi_compressor_name, buffer + offset, len);
+}
+
+static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+        bkd->compress_bytes_in = toi_compress_bytes_in;
+        bkd->compress_bytes_out = toi_compress_bytes_out;
+}
+
+static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+        toi_compress_bytes_in = bkd->compress_bytes_in;
+        toi_compress_bytes_out = bkd->compress_bytes_out;
+}
+
+/*
+ * toi_expected_compression_ratio
+ *
+ * Description:        Returns the expected ratio between data passed into this module
+ *                 and the amount of data output when writing.
+ * Returns:        100 if the module is disabled. Otherwise the value set by the
+ *                 user via our sysfs entry.
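+ *
+ *                 For example (illustrative): with expected_compression set
+ *                 to 40, callers plan for the written image to be roughly
+ *                 60% of its uncompressed size.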
+ */
+
+static int toi_compress_expected_ratio(void)
+{
+        if (!toi_compression_ops.enabled)
+                return 100;
+        else
+                return 100 - toi_expected_compression;
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
+                        0, 99, 0, NULL),
+        SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
+                        NULL),
+        SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
+};
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_compression_ops = {
+        .type                        = FILTER_MODULE,
+        .name                        = "compression",
+        .directory                = "compression",
+        .module                        = THIS_MODULE,
+        .initialise                = toi_compress_init,
+        .memory_needed                 = toi_compress_memory_needed,
+        .print_debug_info        = toi_compress_print_debug_stats,
+        .save_config_info        = toi_compress_save_config_info,
+        .load_config_info        = toi_compress_load_config_info,
+        .storage_needed                = toi_compress_storage_needed,
+        .expected_compression        = toi_compress_expected_ratio,
+
+        .pre_atomic_restore        = toi_compress_pre_atomic_restore,
+        .post_atomic_restore        = toi_compress_post_atomic_restore,
+
+        .rw_init                = toi_compress_rw_init,
+        .rw_cleanup                = toi_compress_rw_cleanup,
+
+        .write_page                = toi_compress_write_page,
+        .read_page                = toi_compress_read_page,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+static __init int toi_compress_load(void)
+{
+        return toi_register_module(&toi_compression_ops);
+}
+
+late_initcall(toi_compress_load);
diff --git b/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c
new file mode 100644
index 0000000..eb62791
--- /dev/null
+++ b/kernel/power/tuxonice_copy_before_write.c
@@ -0,0 +1,240 @@
+/*
+ * kernel/power/tuxonice_copy_before_write.c
+ *
+ * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines (apart from the fault handling code) to deal with allocating memory
+ * for copying pages before they are modified, restoring the contents and getting
+ * the contents written to disk.
+ */
+
+#include <linux/percpu-defs.h>
+#include <linux/sched.h>
+#include <linux/tuxonice.h>
+#include "tuxonice_alloc.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+
+DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
+#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
+#define toi_cbw_pool_size 100
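+
+/*
+ * Pool layout (see _toi_allocate_cbw_data below): each pool page holds
+ * CBWS_PER_PAGE struct toi_cbw entries chained through ->next, and each
+ * entry's ->virt points at a separate page reserved for the pre-write copy
+ * of one protected page.
+ */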
+
+static void _toi_free_cbw_data(struct toi_cbw_state *state)
+{
+    struct toi_cbw *page_ptr, *ptr, *next;
+
+    page_ptr = ptr = state->first;
+
+    while(ptr) {
+        next = ptr->next;
+
+        if (ptr->virt) {
+            toi__free_page(40, virt_to_page(ptr->virt));
+        }
+        if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) {
+            /* Must be on a new page - free the previous one. */
+            toi__free_page(40, virt_to_page(page_ptr));
+            page_ptr = ptr;
+        }
+        ptr = next;
+    }
+
+    if (page_ptr) {
+        toi__free_page(40, virt_to_page(page_ptr));
+    }
+
+    state->first = state->next = state->last = NULL;
+    state->size = 0;
+}
+
+void toi_free_cbw_data(void)
+{
+    int i;
+
+    for_each_online_cpu(i) {
+        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+
+        if (!state->first)
+            continue;
+
+        state->enabled = 0;
+
+        while (state->active) {
+            schedule();
+        }
+
+        _toi_free_cbw_data(state);
+    }
+}
+
+static int _toi_allocate_cbw_data(struct toi_cbw_state *state)
+{
+    while(state->size < toi_cbw_pool_size) {
+        int i;
+        struct toi_cbw *ptr;
+
+        ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL);
+
+        if (!ptr) {
+            return -ENOMEM;
+        }
+
+        if (!state->first) {
+            state->first = state->next = state->last = ptr;
+        }
+
+        for (i = 0; i < CBWS_PER_PAGE; i++) {
+            struct toi_cbw *cbw = &ptr[i];
+
+            cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL);
+            if (!cbw->virt) {
+                state->size += i;
+                printk(KERN_ERR "Out of memory allocating CBW pages.\n");
+                return -ENOMEM;
+            }
+
+            if (cbw == state->first)
+                continue;
+
+            state->last->next = cbw;
+            state->last = cbw;
+        }
+
+        state->size += CBWS_PER_PAGE;
+    }
+
+    state->enabled = 1;
+
+    return 0;
+}
+
+
+int toi_allocate_cbw_data(void)
+{
+    int i, result;
+
+    for_each_online_cpu(i) {
+        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+
+        result = _toi_allocate_cbw_data(state);
+
+        if (result)
+            return result;
+    }
+
+    return 0;
+}
+
+void toi_cbw_restore(void)
+{
+    if (!toi_keeping_image)
+        return;
+
+}
+
+void toi_cbw_write(void)
+{
+    if (!toi_keeping_image)
+        return;
+
+}
+
+/**
+ * toi_cbw_test_read - Test copy before write on one page
+ *
+ * Allocate copy before write buffers, then make one page only copy-before-write
+ * and attempt to write to it. We should then be able to retrieve the original
+ * version from the cbw buffer and the modified version from the page itself.
+ */
+static int toi_cbw_test_read(const char *buffer, int count)
+{
+    unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL);
+    char *original = "Original contents";
+    char *modified = "Modified material";
+    struct page *page = virt_to_page(virt);
+    int i, len = 0, found = 0, pfn = page_to_pfn(page);
+
+    if (!virt) {
+        printk(KERN_ERR "toi_cbw_test_read: Unable to allocate a page for testing.\n");
+        return -ENOMEM;
+    }
+
+    memcpy((char *) virt, original, strlen(original));
+
+    if (toi_allocate_cbw_data()) {
+        printk(KERN_ERR "toi_cbw_test_read: Unable to allocate cbw data.\n");
+        return -ENOMEM;
+    }
+
+    toi_reset_dirtiness_one(pfn, 0);
+
+    SetPageTOI_CBW(page);
+
+    memcpy((char *) virt, modified, strlen(modified));
+
+    if (strncmp((char *) virt, modified, strlen(modified))) {
+        len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n");
+    }
+
+    for_each_online_cpu(i) {
+        struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+        struct toi_cbw *ptr = state->first, *last_ptr = ptr;
+
+        if (!found) {
+            while (ptr) {
+                if (ptr->pfn == pfn) {
+                    found = 1;
+                    if (strncmp(ptr->virt, original, strlen(original))) {
+                        len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n");
+                    } else {
+                        len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n");
+                    }
+                    break;
+                }
+
+                last_ptr = ptr;
+                ptr = ptr->next;
+            }
+        }
+
+        if (!last_ptr)
+            len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
+    }
+
+    if (!found)
+        len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
+
+    toi_free_cbw_data();
+
+    return len;
+}
+
+/*
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
+                        NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
+};
+
+static struct toi_module_ops toi_cbw_ops = {
+        .type                                        = MISC_HIDDEN_MODULE,
+        .name                                        = "copy_before_write debugging",
+        .directory                                = "cbw",
+        .module                                        = THIS_MODULE,
+        .early                                        = 1,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+int toi_cbw_init(void)
+{
+        int result = toi_register_module(&toi_cbw_ops);
+        return result;
+}
diff --git b/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
new file mode 100644
index 0000000..522c836
--- /dev/null
+++ b/kernel/power/tuxonice_extent.c
@@ -0,0 +1,144 @@
+/*
+ * kernel/power/tuxonice_extent.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * These functions encapsulate the manipulation of storage metadata.
+ */
+
+#include <linux/suspend.h>
+#include "tuxonice_modules.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+#include "tuxonice.h"
+
+/**
+ * toi_get_extent - return a free extent
+ *
+ * May fail, returning NULL instead.
+ **/
+static struct hibernate_extent *toi_get_extent(void)
+{
+        return (struct hibernate_extent *) toi_kzalloc(2,
+                        sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
+}
+
+/**
+ * toi_put_extent_chain_from - free a chain of extents starting from value 'from'
+ * @chain:        Chain to free.
+ *
+ * Note that 'from' is an extent value, and may be part way through an extent.
+ * In this case, the extent should be truncated (if necessary) and following
+ * extents freed.
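+ *
+ * For example (illustrative values): given extents 10-19 and 30-39 and
+ * from == 15, the first extent is truncated to 10-14 and the second is
+ * freed entirely.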
+ **/
+void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
+{
+        struct hibernate_extent *this, *prev = NULL;
+
+        this = chain->first;
+
+        while (this) {
+                struct hibernate_extent *next = this->next;
+
+                /* Delete the whole extent? */
+                if (this->start >= from) {
+                        chain->size -= (this->end - this->start + 1);
+                        if (chain->first == this)
+                                chain->first = next;
+                        if (prev)
+                                prev->next = next;
+                        if (chain->last_touched == this)
+                                chain->last_touched = NULL;
+                        if (chain->current_extent == this)
+                                chain->current_extent = NULL;
+                        toi_kfree(2, this, sizeof(*this));
+                        chain->num_extents--;
+                } else {
+                        if (this->end >= from) {
+                                /* Truncate the extent: keep start..from - 1 */
+                                chain->size -= (this->end - from + 1);
+                                this->end = from - 1;
+                        }
+                        prev = this;
+                }
+                this = next;
+        }
+}
+
+/**
+ * toi_put_extent_chain - free a whole chain of extents
+ * @chain:        Chain to free.
+ **/
+void toi_put_extent_chain(struct hibernate_extent_chain *chain)
+{
+    toi_put_extent_chain_from(chain, 0);
+}
+
+/**
+ * toi_add_to_extent_chain - add an extent to an existing chain
+ * @chain:        Chain to which the extent should be added
+ * @start:        Start of the extent (first physical block)
+ * @end:        End of the extent (last physical block)
+ *
+ * The chain information is updated if the insertion is successful.
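+ *
+ * For example (illustrative values): adding 10-12 to a chain whose last
+ * extent is 5-9 grows that extent to 5-12, whereas adding 20-24 appends a
+ * new extent.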
+ **/
+int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
+                unsigned long start, unsigned long end)
+{
+        struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0,
+                "Adding extent %lu-%lu to chain %p.\n", start, end, chain);
+
+        /* Find the right place in the chain */
+        if (chain->last_touched && chain->last_touched->start < start)
+                cur_ext = chain->last_touched;
+        else if (chain->first && chain->first->start < start)
+                cur_ext = chain->first;
+
+        if (cur_ext) {
+                while (cur_ext->next && cur_ext->next->start < start)
+                        cur_ext = cur_ext->next;
+
+                if (cur_ext->end == (start - 1)) {
+                        struct hibernate_extent *next_ext = cur_ext->next;
+                        cur_ext->end = end;
+
+                        /* Merge with the following one? */
+                        if (next_ext && cur_ext->end + 1 == next_ext->start) {
+                                cur_ext->end = next_ext->end;
+                                cur_ext->next = next_ext->next;
+                                toi_kfree(2, next_ext, sizeof(*next_ext));
+                                chain->num_extents--;
+                        }
+
+                        chain->last_touched = cur_ext;
+                        chain->size += (end - start + 1);
+
+                        return 0;
+                }
+        }
+
+        new_ext = toi_get_extent();
+        if (!new_ext) {
+                printk(KERN_INFO "Error: unable to append a new extent to the "
+                                "chain.\n");
+                return -ENOMEM;
+        }
+
+        chain->num_extents++;
+        chain->size += (end - start + 1);
+        new_ext->start = start;
+        new_ext->end = end;
+
+        chain->last_touched = new_ext;
+
+        if (cur_ext) {
+                new_ext->next = cur_ext->next;
+                cur_ext->next = new_ext;
+        } else {
+                if (chain->first)
+                        new_ext->next = chain->first;
+                chain->first = new_ext;
+        }
+
+        return 0;
+}
diff --git b/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
new file mode 100644
index 0000000..aeccf1f
--- /dev/null
+++ b/kernel/power/tuxonice_extent.h
@@ -0,0 +1,45 @@
+/*
+ * kernel/power/tuxonice_extent.h
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations related to extents. Extents are
+ * TuxOnIce's method of storing some of the metadata for the image.
+ * See tuxonice_extent.c for more info.
+ *
+ */
+
+#include "tuxonice_modules.h"
+
+#ifndef EXTENT_H
+#define EXTENT_H
+
+struct hibernate_extent {
+        unsigned long start, end;
+        struct hibernate_extent *next;
+};
+
+struct hibernate_extent_chain {
+        unsigned long size; /* size of the chain, i.e. sum of (end - start + 1) */
+        int num_extents;
+        struct hibernate_extent *first, *last_touched;
+        struct hibernate_extent *current_extent;
+        unsigned long current_offset;
+};
+
+/* Simplify iterating through all the values in an extent chain */
+#define toi_extent_for_each(extent_chain, extentpointer, value) \
+if ((extent_chain)->first) \
+        for ((extentpointer) = (extent_chain)->first, (value) = \
+                        (extentpointer)->start; \
+             ((extentpointer) && ((extentpointer)->next || (value) <= \
+                                 (extentpointer)->end)); \
+             (((value) == (extentpointer)->end) ? \
+                ((extentpointer) = (extentpointer)->next, (value) = \
+                 ((extentpointer) ? (extentpointer)->start : 0)) : \
+                        (value)++))
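+
+/*
+ * Minimal usage sketch (hypothetical names): visits every value covered by
+ * the chain, in order.
+ *
+ *        struct hibernate_extent *ext;
+ *        unsigned long block;
+ *
+ *        toi_extent_for_each(&chain, ext, block)
+ *                process_block(block);
+ */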
+
+extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from);
+#endif
diff --git b/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
new file mode 100644
index 0000000..baf1912
--- /dev/null
+++ b/kernel/power/tuxonice_file.c
@@ -0,0 +1,484 @@
+/*
+ * kernel/power/tuxonice_file.c
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for using a simple file as a
+ * backing store. It is based upon the swap allocator and works in
+ * much the same way. Here, though, we have nothing to do with
+ * swap space, and only one device to worry about.
+ *
+ * The user can just
+ *
+ * echo TuxOnIce > /path/to/my_file
+ *
+ * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
+ *
+ * and
+ *
+ * echo /path/to/my_file > /sys/power/tuxonice/file/target
+ *
+ * then put what they find in /sys/power/tuxonice/resume
+ * as their resume= parameter in lilo.conf (and rerun lilo if using it).
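+ * (For example, a value of the form "file:/dev/sdXN:0xNNN"; this is
+ * illustrative only, and the exact string to use is whatever appears in
+ * that sysfs entry.)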
+ *
+ * Having done this, they're ready to hibernate and resume.
+ *
+ * TODO:
+ * - File resizing.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_io.h"
+
+#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
+
+static struct toi_module_ops toi_fileops;
+
+static struct file *target_file;
+static struct block_device *toi_file_target_bdev;
+static unsigned long pages_available, pages_allocated;
+static char toi_file_target[256];
+static struct inode *target_inode;
+static int file_target_priority;
+static int used_devt;
+static int target_claim;
+static dev_t toi_file_dev_t;
+static int sig_page_index;
+
+/* For test_toi_file_target */
+static struct toi_bdev_info *file_chain;
+
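+/*
+ * A page of the target file is only usable if all of its filesystem blocks
+ * are physically contiguous (and none is a hole, for which bmap() returns
+ * 0), since each image page is submitted as one contiguous block run.
+ */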
+static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num)
+{
+        int j;
+        sector_t last = 0;
+
+        for (j = 0; j < dev_info->blocks_per_page; j++) {
+                sector_t this = bmap(target_inode,
+                                page_num * dev_info->blocks_per_page + j);
+
+                if (!this || (last && (last + 1) != this))
+                        break;
+
+                last = this;
+        }
+
+        return j == dev_info->blocks_per_page;
+}
+
+static unsigned long get_usable_pages(struct toi_bdev_info *dev_info)
+{
+        unsigned long result = 0;
+        struct block_device *bdev = dev_info->bdev;
+        int i;
+
+        switch (target_inode->i_mode & S_IFMT) {
+        case S_IFSOCK:
+        case S_IFCHR:
+        case S_IFIFO: /* Socket, Char, Fifo */
+                return -1;
+        case S_IFREG: /* Regular file: current size - holes + free
+                         space on part */
+                for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) {
+                        if (has_contiguous_blocks(dev_info, i))
+                                result++;
+                }
+                break;
+        case S_IFBLK: /* Block device */
+                if (!bdev->bd_disk) {
+                        toi_message(TOI_IO, TOI_VERBOSE, 0,
+                                        "bdev->bd_disk null.");
+                        return 0;
+                }
+
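+                /* Sector counts are in 512-byte units; convert to pages. */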
+                result = (bdev->bd_part ?
+                        bdev->bd_part->nr_sects :
+                        get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9);
+        }
+
+
+        return result;
+}
+
+static int toi_file_register_storage(void)
+{
+        struct toi_bdev_info *devinfo;
+        int result = 0;
+        struct fs_info *fs_info;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage.");
+        if (!strlen(toi_file_target)) {
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: "
+                                "No target filename set.");
+                return 0;
+        }
+
+        target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0);
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.",
+                        toi_file_target, target_file);
+
+        if (IS_ERR(target_file) || !target_file) {
+                target_file = NULL;
+                toi_file_dev_t = name_to_dev_t(toi_file_target);
+                if (!toi_file_dev_t) {
+                        struct kstat stat;
+                        int error = vfs_stat(toi_file_target, &stat);
+                        printk(KERN_INFO "Open file %s returned %p and "
+                                        "name_to_devt failed.\n",
+                                        toi_file_target, target_file);
+                        if (error) {
+                                printk(KERN_INFO "Stating the file also failed."
+                                        " Nothing more we can do.\n");
+                                return 0;
+                        } else
+                                toi_file_dev_t = stat.rdev;
+                }
+
+                toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t);
+                if (IS_ERR(toi_file_target_bdev)) {
+                        printk(KERN_INFO "Got a dev_num (%lx) but failed to "
+                                        "open it.\n",
+                                        (unsigned long) toi_file_dev_t);
+                        toi_file_target_bdev = NULL;
+                        return 0;
+                }
+                used_devt = 1;
+                target_inode = toi_file_target_bdev->bd_inode;
+        } else
+                target_inode = target_file->f_mapping->host;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target.");
+        if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
+            S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
+                printk(KERN_INFO "File support works with regular files,"
+                                " character files and block devices.\n");
+                /* Cleanup routine will undo the above */
+                return 0;
+        }
+
+        if (!used_devt) {
+                if (S_ISBLK(target_inode->i_mode)) {
+                        toi_file_target_bdev = I_BDEV(target_inode);
+                        if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE |
+                                                FMODE_READ, NULL))
+                                target_claim = 1;
+                } else
+                        toi_file_target_bdev = target_inode->i_sb->s_bdev;
+                if (!toi_file_target_bdev) {
+                        printk(KERN_INFO "%s is not a valid file allocator "
+                                        "target.\n", toi_file_target);
+                        return 0;
+                }
+                toi_file_dev_t = toi_file_target_bdev->bd_dev;
+        }
+
+        devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC);
+        if (!devinfo) {
+                printk(KERN_ERR "Failed to allocate a toi_bdev_info struct for the file allocator.\n");
+                return -ENOMEM;
+        }
+
+        devinfo->bdev = toi_file_target_bdev;
+        devinfo->allocator = &toi_fileops;
+        devinfo->allocator_index = 0;
+
+        fs_info = fs_info_from_block_dev(toi_file_target_bdev);
+        if (fs_info && !IS_ERR(fs_info)) {
+                memcpy(devinfo->uuid, &fs_info->uuid, 16);
+                free_fs_info(fs_info);
+        } else
+                result = (int) PTR_ERR(fs_info);
+
+        /* Unlike swap code, only complain if fs_info_from_block_dev returned
+         * -ENOMEM. The 'file' might be a full partition, so might validly not
+         * have an identifiable type, UUID etc.
+         */
+        if (result)
+                printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n",
+                                result);
+        devinfo->dev_t = toi_file_dev_t;
+        devinfo->prio = file_target_priority;
+        devinfo->bmap_shift = target_inode->i_blkbits - 9;
+        devinfo->blocks_per_page =
+                (1 << (PAGE_SHIFT - target_inode->i_blkbits));
+        sprintf(devinfo->name, "file %s", toi_file_target);
+        file_chain = devinfo;
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap "
+                        "shift is %d. Blocks per page %d.",
+                        devinfo->dev_t, devinfo->prio, devinfo->bmap_shift,
+                        devinfo->blocks_per_page);
+
+        /* Keep one aside for the signature */
+        pages_available = get_usable_pages(devinfo) - 1;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu "
+                        "pages.", pages_available);
+
+        toi_bio_ops.register_storage(devinfo);
+        return 0;
+}
+
+static unsigned long toi_file_storage_available(void)
+{
+        return pages_available;
+}
+
+static int toi_file_allocate_storage(struct toi_bdev_info *chain,
+                unsigned long request)
+{
+        unsigned long available = pages_available - pages_allocated;
+        unsigned long to_add = min(available, request);
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated "
+                "is %lu. Allocating %lu pages from file.",
+                pages_available, pages_allocated, to_add);
+        pages_allocated += to_add;
+
+        return to_add;
+}
+
+/**
+ * __populate_block_list - add an extent to the chain
+ * @min:        Start of the extent (first physical block = sector)
+ * @max:        End of the extent (last physical block = sector)
+ *
+ * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
+ * fs block numbers.
+ **/
+static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
+{
+        if (test_action_state(TOI_TEST_BIO))
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
+                        min << chain->bmap_shift,
+                        ((max + 1) << chain->bmap_shift) - 1);
+
+        return toi_add_to_extent_chain(&chain->blocks, min, max);
+}
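+
+/*
+ * Illustrative example (not from the original code): with 1 KiB filesystem
+ * blocks on a 4 KiB page system, bmap_shift is 10 - 9 = 1, so an extent
+ * covering fs blocks 100-103 is reported by the debug message above as
+ * sectors 200-207, while the extent chain itself stores the unshifted
+ * block numbers.
+ */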
+
+static int get_main_pool_phys_params(struct toi_bdev_info *chain)
+{
+        int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
+        unsigned long pages_mapped = 0;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
+
+        if (chain->blocks.first)
+                toi_put_extent_chain(&chain->blocks);
+
+        if (!target_is_normal_file()) {
+                result = (pages_available > 0) ?
+                        __populate_block_list(chain, chain->blocks_per_page,
+                                (pages_allocated + 1) *
+                                chain->blocks_per_page - 1) : 0;
+                return result;
+        }
+
+        /*
+         * FIXME: We are assuming the first page is contiguous. Is that
+         * assumption always right?
+         */
+
+        for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
+                sector_t new_sector;
+
+                if (!has_contiguous_blocks(chain, i))
+                        continue;
+
+                if (!have_sig_page) {
+                        have_sig_page = 1;
+                        sig_page_index = i;
+                        continue;
+                }
+
+                pages_mapped++;
+
+                /* Ignore first page - it has the header */
+                if (pages_mapped == 1)
+                        continue;
+
+                new_sector = bmap(target_inode, (i * chain->blocks_per_page));
+
+                /*
+                 * I'd love to be able to fill in holes and resize
+                 * files, but not yet...
+                 */
+
+                if (new_sector == extent_max + 1)
+                        extent_max += chain->blocks_per_page;
+                else {
+                        if (extent_min > -1) {
+                                result = __populate_block_list(chain,
+                                                extent_min, extent_max);
+                                if (result)
+                                        return result;
+                        }
+
+                        extent_min = new_sector;
+                        extent_max = extent_min +
+                                chain->blocks_per_page - 1;
+                }
+
+                if (pages_mapped == pages_allocated)
+                        break;
+        }
+
+        if (extent_min > -1) {
+                result = __populate_block_list(chain, extent_min, extent_max);
+                if (result)
+                        return result;
+        }
+
+        return 0;
+}
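+
+/*
+ * Worked example of the extent merging above (illustrative only, and
+ * ignoring the signature/header pages the loop skips): with 512-byte
+ * filesystem blocks, blocks_per_page is 8. If bmap() returns physical
+ * blocks 512, 520 and 528 for three consecutive file pages, each new page
+ * starts at extent_max + 1, so the three pages coalesce into a single
+ * extent covering blocks 512-535 instead of three separate extents.
+ */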
+
+static void toi_file_free_storage(struct toi_bdev_info *chain)
+{
+        pages_allocated = 0;
+        file_chain = NULL;
+}
+
+/**
+ * toi_file_print_debug_stats - print debug info
+ * @buffer:        Buffer to populate
+ * @size:        Size of the buffer
+ **/
+static int toi_file_print_debug_stats(char *buffer, int size)
+{
+        int len = scnprintf(buffer, size, "- File Allocator active.\n");
+
+        len += scnprintf(buffer+len, size-len, "  Storage available for "
+                        "image: %lu pages.\n", pages_available);
+
+        return len;
+}
+
+static void toi_file_cleanup(int finishing_cycle)
+{
+        if (toi_file_target_bdev) {
+                if (target_claim) {
+                        blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
+                        target_claim = 0;
+                }
+
+                if (used_devt) {
+                        blkdev_put(toi_file_target_bdev,
+                                        FMODE_READ | FMODE_NDELAY);
+                        used_devt = 0;
+                }
+                toi_file_target_bdev = NULL;
+                target_inode = NULL;
+        }
+
+        if (target_file) {
+                filp_close(target_file, NULL);
+                target_file = NULL;
+        }
+
+        pages_available = 0;
+}
+
+/**
+ * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
+ *
+ * Test whether the target file is valid for hibernating.
+ **/
+static void test_toi_file_target(void)
+{
+        int result = toi_file_register_storage();
+        sector_t sector;
+        char buf[50];
+        struct fs_info *fs_info;
+
+        if (result || !file_chain)
+                return;
+
+        /* This doesn't mean we're in business. Is any storage available? */
+        if (!pages_available)
+                goto out;
+
+        toi_file_allocate_storage(file_chain, 1);
+        result = get_main_pool_phys_params(file_chain);
+        if (result)
+                goto out;
+
+
+        sector = bmap(target_inode, sig_page_index *
+                        file_chain->blocks_per_page) << file_chain->bmap_shift;
+
+        /* Use the uuid, or the dev_t if that fails */
+        fs_info = fs_info_from_block_dev(toi_file_target_bdev);
+        if (!fs_info || IS_ERR(fs_info)) {
+                bdevname(toi_file_target_bdev, buf);
+                sprintf(resume_file, "/dev/%s:%llu", buf,
+                                (unsigned long long) sector);
+        } else {
+                int i;
+                hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0);
+
+                /* Remove the spaces */
+                for (i = 1; i < 16; i++) {
+                        buf[2 * i] = buf[3 * i];
+                        buf[2 * i + 1] = buf[3 * i + 1];
+                }
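+                /*
+                 * hex_dump_to_buffer() with a group size of 1 emits the
+                 * bytes as "aa bb cc ...", i.e. byte i sits at offset
+                 * 3 * i; the loop above moves it to offset 2 * i, giving
+                 * a contiguous "aabbcc..." string before the trailing NUL
+                 * is written below.
+                 */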
+                buf[32] = 0;
+                sprintf(resume_file, "UUID=%s:0x%llx", buf,
+                                (unsigned long long) sector);
+                free_fs_info(fs_info);
+        }
+
+        toi_attempt_to_parse_resume_device(0);
+out:
+        toi_file_free_storage(file_chain);
+        toi_bio_ops.free_storage();
+}
+
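+/*
+ * These entries appear as /sys/power/tuxonice/file/target, .../enabled and
+ * .../priority (the "file" directory name comes from toi_fileops.directory
+ * below). 'target' names the backing file or block device, 'enabled'
+ * switches the allocator on or off, and 'priority' presumably orders this
+ * allocator relative to other registered storage.
+ */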
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
+                SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
+        SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL),
+        SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095,
+                        4096, 0, NULL),
+};
+
+static struct toi_bio_allocator_ops toi_bio_fileops = {
+        .register_storage                        = toi_file_register_storage,
+        .storage_available                        = toi_file_storage_available,
+        .allocate_storage                        = toi_file_allocate_storage,
+        .bmap                                        = get_main_pool_phys_params,
+        .free_storage                                = toi_file_free_storage,
+};
+
+static struct toi_module_ops toi_fileops = {
+        .type                                        = BIO_ALLOCATOR_MODULE,
+        .name                                        = "file storage",
+        .directory                                = "file",
+        .module                                        = THIS_MODULE,
+        .print_debug_info                        = toi_file_print_debug_stats,
+        .cleanup                                = toi_file_cleanup,
+        .bio_allocator_ops                        = &toi_bio_fileops,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+static __init int toi_file_load(void)
+{
+        return toi_register_module(&toi_fileops);
+}
+
+late_initcall(toi_file_load);
diff --git b/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
new file mode 100644
index 0000000..1ef1f22
--- /dev/null
+++ b/kernel/power/tuxonice_highlevel.c
@@ -0,0 +1,1416 @@
+/*
+ * kernel/power/tuxonice_highlevel.c
+ */
+/** \mainpage TuxOnIce.
+ *
+ * TuxOnIce provides support for saving and restoring an image of
+ * system memory to an arbitrary storage device, either on the local computer,
+ * or across some network. The support is entirely OS based, so TuxOnIce
+ * works without requiring BIOS, APM or ACPI support. The vast majority of the
+ * code is also architecture independent, so it should be very easy to port
+ * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
+ * and preemption. Initramfses and initrds are also supported.
+ *
+ * TuxOnIce uses a modular design, in which the method of storing the image is
+ * completely abstracted from the core code, as are transformations on the data
+ * such as compression and/or encryption (multiple 'modules' can be used to
+ * provide arbitrary combinations of functionality). The user interface is also
+ * modular, so that arbitrarily simple or complex interfaces can be used to
+ * provide anything from debugging information through to eye candy.
+ *
+ * \section Copyright
+ *
+ * TuxOnIce is released under the GPLv2.
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR>
+ *
+ * \section Credits
+ *
+ * Nigel would like to thank the following people for their work:
+ *
+ * Bernard Blackham <bernard@blackham.com.au><BR>
+ * Web page & Wiki administration, some coding. A person without whom
+ * TuxOnIce would not be where it is.
+ *
+ * Michael Frank <mhf@linuxmail.org><BR>
+ * Extensive testing and help with improving stability. I was constantly
+ * amazed by the quality and quantity of Michael's help.
+ *
+ * Pavel Machek <pavel@ucw.cz><BR>
+ * Modifications, defectiveness pointing, being with Gabor at the very
+ * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
+ * 2.5.17. Even though Pavel and I disagree on the direction suspend to
+ * disk should take, I appreciate the valuable work he did in helping Gabor
+ * get the concept working.
+ *
+ * ..and of course the myriads of TuxOnIce users who have helped diagnose
+ * and fix bugs, made suggestions on how to improve the code, proofread
+ * documentation, and donated time and money.
+ *
+ * Thanks also to corporate sponsors:
+ *
+ * <B>Redhat.</B> Sometime employer from May 2006 (my fault, not Redhat's!).
+ *
+ * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
+ * allowed him to work on TuxOnIce and PM related issues on company time.
+ *
+ * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
+ * 2003 to Jan 2004.
+ *
+ * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
+ * maintenance of SMP and Highmem support.
+ *
+ * <B>OSDL.</B> Provided access to various hardware configurations and made
+ * occasional small donations to the project.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/freezer.h>
+#include <generated/utsrelease.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/writeback.h>
+#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */
+#include <linux/bio.h>
+#include <linux/kgdb.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_cluster.h"
+
+/*! Pageset metadata. */
+struct pagedir pagedir2 = {2};
+
+static mm_segment_t oldfs;
+static DEFINE_MUTEX(tuxonice_in_use);
+static int block_dump_save;
+
+int toi_trace_index;
+
+/* Binary signature if an image is present */
+char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
+
+unsigned long boot_kernel_data_buffer;
+
+static char *result_strings[] = {
+        "Hibernation was aborted",
+        "The user requested that we cancel the hibernation",
+        "No storage was available",
+        "Insufficient storage was available",
+        "Freezing filesystems and/or tasks failed",
+        "A pre-existing image was used",
+        "We would free memory, but image size limit doesn't allow this",
+        "Unable to free enough memory to hibernate",
+        "Unable to obtain the Power Management Semaphore",
+        "A device suspend/resume returned an error",
+        "A system device suspend/resume returned an error",
+        "The extra pages allowance is too small",
+        "We were unable to successfully prepare an image",
+        "TuxOnIce module initialisation failed",
+        "TuxOnIce module cleanup failed",
+        "I/O errors were encountered",
+        "Ran out of memory",
+        "An error was encountered while reading the image",
+        "Platform preparation failed",
+        "CPU Hotplugging failed",
+        "Architecture specific preparation failed",
+        "Pages needed resaving, but we were told to abort if this happens",
+        "We can't hibernate at the moment (invalid resume= or filewriter "
+                "target?)",
+        "A hibernation preparation notifier chain member cancelled the "
+                "hibernation",
+        "Pre-snapshot preparation failed",
+        "Pre-restore preparation failed",
+        "Failed to disable usermode helpers",
+        "Can't resume from alternate image",
+        "Header reservation too small",
+        "Device Power Management Preparation failed",
+};
+
+/**
+ * toi_finish_anything - cleanup after doing anything
+ * @hibernate_or_resume:        Whether finishing a cycle or attempt at
+ *                                resuming.
+ *
+ * This is our basic clean-up routine, matching start_anything below. We
+ * call cleanup routines, drop module references and restore process fs and
+ * cpus allowed masks, together with the global block_dump variable's value.
+ **/
+void toi_finish_anything(int hibernate_or_resume)
+{
+        toi_running = 0;
+        toi_cleanup_modules(hibernate_or_resume);
+        toi_put_modules();
+        if (hibernate_or_resume) {
+                block_dump = block_dump_save;
+                set_cpus_allowed_ptr(current, cpu_all_mask);
+                toi_alloc_print_debug_stats();
+                atomic_inc(&snapshot_device_available);
+                unlock_system_sleep();
+                release_super_lock();
+        }
+
+        set_fs(oldfs);
+        mutex_unlock(&tuxonice_in_use);
+}
+
+/**
+ * toi_start_anything - basic initialisation for TuxOnIce
+ * @hibernate_or_resume:        Whether starting a cycle or attempt at resuming.
+ *
+ * Our basic initialisation routine. Take references on modules, use the
+ * kernel segment, recheck resume= if no active allocator is set, initialise
+ * modules, save and reset block_dump and ensure we're running on CPU0.
+ **/
+int toi_start_anything(int hibernate_or_resume)
+{
+        mutex_lock(&tuxonice_in_use);
+
+        oldfs = get_fs();
+        set_fs(KERNEL_DS);
+
+        toi_trace_index = 0;
+
+        if (hibernate_or_resume) {
+                take_super_lock();
+                lock_system_sleep();
+
+                if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+                        goto snapshotdevice_unavailable;
+        }
+
+        if (hibernate_or_resume == SYSFS_HIBERNATE)
+                toi_print_modules();
+
+        if (toi_get_modules()) {
+                printk(KERN_INFO "TuxOnIce: Get modules failed!\n");
+                goto prehibernate_err;
+        }
+
+        if (hibernate_or_resume) {
+                block_dump_save = block_dump;
+                block_dump = 0;
+                set_cpus_allowed_ptr(current,
+                                cpumask_of(cpumask_first(cpu_online_mask)));
+        }
+
+        if (toi_initialise_modules_early(hibernate_or_resume))
+                goto early_init_err;
+
+        if (!toiActiveAllocator)
+                toi_attempt_to_parse_resume_device(!hibernate_or_resume);
+
+        if (!toi_initialise_modules_late(hibernate_or_resume)) {
+                toi_running = 1; /* For the swsusp code we use :< */
+                return 0;
+        }
+
+        toi_cleanup_modules(hibernate_or_resume);
+early_init_err:
+        if (hibernate_or_resume) {
+                block_dump = block_dump_save;
+                set_cpus_allowed_ptr(current, cpu_all_mask);
+        }
+        toi_put_modules();
+prehibernate_err:
+        if (hibernate_or_resume)
+                atomic_inc(&snapshot_device_available);
+snapshotdevice_unavailable:
+        if (hibernate_or_resume)
+                mutex_unlock(&pm_mutex);
+        release_super_lock();
+        set_fs(oldfs);
+        mutex_unlock(&tuxonice_in_use);
+        return -EBUSY;
+}
+
+/*
+ * Nosave page tracking.
+ *
+ * Here rather than in prepare_image because we want to do it once only at the
+ * start of a cycle.
+ */
+
+/**
+ * mark_nosave_pages - set up our Nosave bitmap
+ *
+ * Build a bitmap of Nosave pages from the list. The bitmap allows faster
+ * use when preparing the image.
+ **/
+static void mark_nosave_pages(void)
+{
+        struct nosave_region *region;
+
+        list_for_each_entry(region, &nosave_regions, list) {
+                unsigned long pfn;
+
+                for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
+                        if (pfn_valid(pfn)) {
+                                SetPageNosave(pfn_to_page(pfn));
+                        }
+        }
+}
+
+/**
+ * allocate_bitmaps - allocate bitmaps used to record page states
+ *
+ * Allocate the bitmaps we use to record the various TuxOnIce related
+ * page states.
+ **/
+static int allocate_bitmaps(void)
+{
+        if (toi_alloc_bitmap(&pageset1_map) ||
+            toi_alloc_bitmap(&pageset1_copy_map) ||
+            toi_alloc_bitmap(&pageset2_map) ||
+            toi_alloc_bitmap(&io_map) ||
+            toi_alloc_bitmap(&nosave_map) ||
+            toi_alloc_bitmap(&free_map) ||
+            toi_alloc_bitmap(&compare_map) ||
+            toi_alloc_bitmap(&page_resave_map))
+                return 1;
+
+        return 0;
+}
+
+/**
+ * free_bitmaps - free the bitmaps used to record page states
+ *
+ * Free the bitmaps allocated above. It is not an error to call
+ * toi_free_bitmap on a bitmap that isn't currently allocated.
+ **/
+static void free_bitmaps(void)
+{
+        toi_free_bitmap(&pageset1_map);
+        toi_free_bitmap(&pageset1_copy_map);
+        toi_free_bitmap(&pageset2_map);
+        toi_free_bitmap(&io_map);
+        toi_free_bitmap(&nosave_map);
+        toi_free_bitmap(&free_map);
+        toi_free_bitmap(&compare_map);
+        toi_free_bitmap(&page_resave_map);
+}
+
+/**
+ * io_MB_per_second - return the number of MB/s read or written
+ * @write:        Whether to return the speed at which we wrote.
+ *
+ * Calculate the number of megabytes per second that were read or written.
+ **/
+static int io_MB_per_second(int write)
+{
+        return (toi_bkd.toi_io_time[write][1]) ?
+                MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
+                toi_bkd.toi_io_time[write][1] : 0;
+}
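+
+/*
+ * Illustrative calculation, assuming MB() converts a page count to whole
+ * megabytes: writing 25600 pages (100 MB with 4 KiB pages) in 250 jiffies
+ * with HZ == 250 (i.e. one second) gives 100 * 250 / 250 = 100 MB/s.
+ */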
+
+#define SNPRINTF(a...)         do { len += scnprintf(((char *) buffer) + len, \
+                count - len - 1, ## a); } while (0)
+
+/**
+ * get_toi_debug_info - fill a buffer with debugging information
+ * @buffer:        The buffer to be filled.
+ * @count:        The size of the buffer, in bytes.
+ *
+ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
+ * either printk or return via sysfs.
+ **/
+static int get_toi_debug_info(const char *buffer, int count)
+{
+        int len = 0, i, first_result = 1;
+
+        SNPRINTF("TuxOnIce debugging info:\n");
+        SNPRINTF("- TuxOnIce core  : " TOI_CORE_VERSION "\n");
+        SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
+        SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
+        SNPRINTF("- Attempt number : %d\n", nr_hibernates);
+        SNPRINTF("- Parameters     : %ld %ld %ld %d %ld %ld\n",
+                        toi_result,
+                        toi_bkd.toi_action,
+                        toi_bkd.toi_debug_state,
+                        toi_bkd.toi_default_console_level,
+                        image_size_limit,
+                        toi_poweroff_method);
+        SNPRINTF("- Overall expected compression percentage: %d.\n",
+                        100 - toi_expected_compression_ratio());
+        len += toi_print_module_debug_info(((char *) buffer) + len,
+                        count - len - 1);
+        if (toi_bkd.toi_io_time[0][1]) {
+                if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
+                        SNPRINTF("- I/O speed: Write %ld KB/s",
+                          (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
+                          toi_bkd.toi_io_time[0][1]));
+                        if (toi_bkd.toi_io_time[1][1])
+                                SNPRINTF(", Read %ld KB/s",
+                                  (KB((unsigned long)
+                                      toi_bkd.toi_io_time[1][0]) * HZ /
+                                  toi_bkd.toi_io_time[1][1]));
+                } else {
+                        SNPRINTF("- I/O speed: Write %ld MB/s",
+                         (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
+                          toi_bkd.toi_io_time[0][1]));
+                        if (toi_bkd.toi_io_time[1][1])
+                                SNPRINTF(", Read %ld MB/s",
+                                 (MB((unsigned long)
+                                     toi_bkd.toi_io_time[1][0]) * HZ /
+                                  toi_bkd.toi_io_time[1][1]));
+                }
+                SNPRINTF(".\n");
+        } else
+                SNPRINTF("- No I/O speed stats available.\n");
+        SNPRINTF("- Extra pages    : %lu used/%lu.\n",
+                        extra_pd1_pages_used, extra_pd1_pages_allowance);
+
+        for (i = 0; i < TOI_NUM_RESULT_STATES; i++)
+                if (test_result_state(i)) {
+                        SNPRINTF("%s: %s.\n", first_result ?
+                                        "- Result         " :
+                                        "                 ",
+                                        result_strings[i]);
+                        first_result = 0;
+                }
+        if (first_result)
+                SNPRINTF("- Result         : %s.\n", nr_hibernates ?
+                        "Succeeded" :
+                        "No hibernation attempts so far");
+        return len;
+}
+
+#ifdef CONFIG_TOI_INCREMENTAL
+/**
+ * get_toi_page_state - fill a buffer with page state information
+ * @buffer:        The buffer to be filled.
+ * @count:        The size of the buffer, in bytes.
+ *
+ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
+ * either printk or return via sysfs.
+ **/
+static int get_toi_page_state(const char *buffer, int count)
+{
+    int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0;
+    int len = 0;
+    struct zone *zone;
+    int allocated_bitmaps = 0;
+
+    set_cpus_allowed_ptr(current,
+            cpumask_of(cpumask_first(cpu_online_mask)));
+
+    if (!free_map) {
+        BUG_ON(toi_alloc_bitmap(&free_map));
+        allocated_bitmaps = 1;
+    }
+
+    toi_generate_free_page_map();
+
+    for_each_populated_zone(zone) {
+        unsigned long loop;
+
+        total += zone->spanned_pages;
+
+        for (loop = 0; loop < zone->spanned_pages; loop++) {
+            unsigned long pfn = zone->zone_start_pfn + loop;
+            struct page *page;
+            int chunk_size;
+
+            if (!pfn_valid(pfn)) {
+                continue;
+            }
+
+            chunk_size = toi_size_of_free_region(zone, pfn);
+            if (chunk_size) {
+                /*
+                 * If the page gets allocated, it will need
+                 * saving in an image.
+                 * Don't bother with explicitly removing any
+                 * RO protection applied below.
+                 * We'll SetPageTOI_Dirty(page) if/when it
+                 * gets allocated.
+                 */
+                free += chunk_size;
+                loop += chunk_size - 1;
+                continue;
+            }
+
+            page = pfn_to_page(pfn);
+
+            if (PageTOI_Untracked(page)) {
+                untracked++;
+            } else if (PageTOI_RO(page)) {
+                ro++;
+            } else if (PageTOI_Dirty(page)) {
+                dirty++;
+            } else {
+                printk("Page %lu state 'other'.\n", pfn);
+                other++;
+            }
+        }
+    }
+
+    if (allocated_bitmaps) {
+        toi_free_bitmap(&free_map);
+    }
+
+    set_cpus_allowed_ptr(current, cpu_all_mask);
+
+    SNPRINTF("TuxOnIce page breakdown:\n");
+    SNPRINTF("- Free           : %d\n", free);
+    SNPRINTF("- Untracked      : %d\n", untracked);
+    SNPRINTF("- Read only      : %d\n", ro);
+    SNPRINTF("- Dirty          : %d\n", dirty);
+    SNPRINTF("- Other          : %d\n", other);
+    SNPRINTF("- Invalid        : %d\n", invalid);
+    SNPRINTF("- Total          : %d\n", total);
+    return len;
+}
+#endif
+
+/**
+ * do_cleanup - cleanup after attempting to hibernate or resume
+ * @get_debug_info:        Whether to allocate and return debugging info.
+ *
+ * Cleanup after attempting to hibernate or resume, possibly getting
+ * debugging info as we do so.
+ **/
+static void do_cleanup(int get_debug_info, int restarting)
+{
+        int i = 0;
+        char *buffer = NULL;
+
+        trap_non_toi_io = 0;
+
+        if (get_debug_info)
+                toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
+
+        free_checksum_pages();
+
+        toi_cbw_restore();
+        toi_free_cbw_data();
+
+        if (get_debug_info)
+                buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
+
+        if (buffer)
+                i = get_toi_debug_info(buffer, PAGE_SIZE);
+
+        toi_free_extra_pagedir_memory();
+
+        pagedir1.size = 0;
+        pagedir2.size = 0;
+        set_highmem_size(pagedir1, 0);
+        set_highmem_size(pagedir2, 0);
+
+        if (boot_kernel_data_buffer) {
+                if (!test_toi_state(TOI_BOOT_KERNEL))
+                        toi_free_page(37, boot_kernel_data_buffer);
+                boot_kernel_data_buffer = 0;
+        }
+
+        if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) {
+                unlock_device_hotplug();
+                clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
+        }
+
+        clear_toi_state(TOI_BOOT_KERNEL);
+        if (current->flags & PF_SUSPEND_TASK)
+                thaw_processes();
+
+        if (!restarting)
+                toi_stop_other_threads();
+
+        if (toi_keeping_image &&
+            !test_result_state(TOI_ABORTED)) {
+                toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
+                        "TuxOnIce: Not invalidating the image due "
+                        "to Keep Image or Incremental Image being enabled.");
+                set_result_state(TOI_KEPT_IMAGE);
+
+                /*
+                 * For an incremental image, free unused storage so
+                 * swap (if any) can be used for normal system operation,
+                 * if so desired.
+                 */
+
+                toiActiveAllocator->free_unused_storage();
+        } else
+                if (toiActiveAllocator)
+                        toiActiveAllocator->remove_image();
+
+        free_bitmaps();
+        usermodehelper_enable();
+
+        if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
+                pm_notifier_call_chain(PM_POST_HIBERNATION);
+                clear_toi_state(TOI_NOTIFIERS_PREPARE);
+        }
+
+        if (buffer && i) {
+                /* Printk can only handle 1023 bytes, including
+                 * its level mangling. */
+                for (i = 0; i < 3; i++)
+                        printk(KERN_ERR "%s", buffer + (1023 * i));
+                toi_free_page(20, (unsigned long) buffer);
+        }
+
+        if (!restarting)
+                toi_cleanup_console();
+
+        free_attention_list();
+
+        if (!restarting)
+                toi_deactivate_storage(0);
+
+        clear_toi_state(TOI_IGNORE_LOGLEVEL);
+        clear_toi_state(TOI_TRYING_TO_RESUME);
+        clear_toi_state(TOI_NOW_RESUMING);
+}
+
+/**
+ * check_still_keeping_image - we kept an image; check whether to reuse it.
+ *
+ * We enter this routine when we have kept an image. If the user has said they
+ * want to still keep it, all we need to do is power down. If powering down
+ * means hibernating to RAM and the power doesn't run out, we'll return 1.
+ * If we do power off properly or the battery runs out, we'll resume via the
+ * normal paths.
+ *
+ * If the user has said they want to remove the previously kept image, we
+ * remove it, and return 0. We'll then store a new image.
+ **/
+static int check_still_keeping_image(void)
+{
+    if (toi_keeping_image) {
+        if (!test_action_state(TOI_INCREMENTAL_IMAGE)) {
+            printk(KERN_INFO "Image already stored: powering down "
+                    "immediately.");
+            do_toi_step(STEP_HIBERNATE_POWERDOWN);
+            return 1;
+        }
+        /*
+         * Incremental image - need to write new part.
+         * We detect that we're writing an incremental image by looking
+         * at test_result_state(TOI_KEPT_IMAGE).
+         */
+        return 0;
+    }
+
+    printk(KERN_INFO "Invalidating previous image.\n");
+    toiActiveAllocator->remove_image();
+
+    return 0;
+}
+
+/**
+ * toi_init - prepare to hibernate to disk
+ *
+ * Initialise variables & data structures, in preparation for
+ * hibernating to disk.
+ **/
+static int toi_init(int restarting)
+{
+        int result, i, j;
+
+        toi_result = 0;
+
+        printk(KERN_INFO "Initiating a hibernation cycle.\n");
+
+        nr_hibernates++;
+
+        for (i = 0; i < 2; i++)
+                for (j = 0; j < 2; j++)
+                        toi_bkd.toi_io_time[i][j] = 0;
+
+        if (!test_toi_state(TOI_CAN_HIBERNATE) ||
+            allocate_bitmaps())
+                return 1;
+
+        mark_nosave_pages();
+
+        if (!restarting)
+                toi_prepare_console();
+
+        result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+        if (result) {
+                set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
+                return 1;
+        }
+        set_toi_state(TOI_NOTIFIERS_PREPARE);
+
+        if (!restarting) {
+                printk(KERN_INFO "Starting other threads.\n");
+                toi_start_other_threads();
+        }
+
+        result = usermodehelper_disable();
+        if (result) {
+                printk(KERN_ERR "TuxOnIce: Failed to disable usermode "
+                                "helpers\n");
+                set_result_state(TOI_USERMODE_HELPERS_ERR);
+                return 1;
+        }
+
+        boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
+        if (!boot_kernel_data_buffer) {
+                printk(KERN_ERR "TuxOnIce: Failed to allocate "
+                                "boot_kernel_data_buffer.\n");
+                set_result_state(TOI_OUT_OF_MEMORY);
+                return 1;
+        }
+
+        toi_allocate_cbw_data();
+
+        return 0;
+}
+
+/**
+ * can_hibernate - perform basic 'Can we hibernate?' tests
+ *
+ * Perform basic tests that must pass if we're going to be able to hibernate:
+ * Can we get the pm_mutex? Is resume= valid (we need to know where to write
+ * the image header)?
+ **/
+static int can_hibernate(void)
+{
+        if (!test_toi_state(TOI_CAN_HIBERNATE))
+                toi_attempt_to_parse_resume_device(0);
+
+        if (!test_toi_state(TOI_CAN_HIBERNATE)) {
+                printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
+                        "This may be because you haven't put something along "
+                        "the lines of\n\nresume=swap:/dev/hda1\n\n"
+                        "in lilo.conf or equivalent. (Where /dev/hda1 is your "
+                        "swap partition).\n");
+                set_abort_result(TOI_CANT_SUSPEND);
+                return 0;
+        }
+
+        if (strlen(alt_resume_param)) {
+                attempt_to_parse_alt_resume_param();
+
+                if (!strlen(alt_resume_param)) {
+                        printk(KERN_INFO "Alternate resume parameter now "
+                                        "invalid. Aborting.\n");
+                        set_abort_result(TOI_CANT_USE_ALT_RESUME);
+                        return 0;
+                }
+        }
+
+        return 1;
+}
+
+/**
+ * do_post_image_write - having written an image, figure out what to do next
+ *
+ * After writing an image, we might load an alternate image or power down.
+ * Powering down might involve hibernating to ram, in which case we also
+ * need to handle reloading pageset2.
+ **/
+static int do_post_image_write(void)
+{
+        /* If switching images fails, do normal powerdown */
+        if (alt_resume_param[0])
+                do_toi_step(STEP_RESUME_ALT_IMAGE);
+
+        toi_power_down();
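+        /*
+         * For example (illustrative only): with PAGE_SHIFT == 12 and 4 KiB
+         * filesystem blocks (i_blkbits == 12), bmap_shift is 3 and
+         * blocks_per_page is 1; with 2 KiB blocks they are 2 and 2
+         * respectively.
+         */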
+
+        barrier();
+        mb();
+        return 0;
+}
+
+/**
+ * __save_image - do the hard work of saving the image
+ *
+ * High level routine for getting the image saved. The key assumptions made
+ * are that processes have been frozen and sufficient memory is available.
+ *
+ * We also exit through here at resume time, coming back from toi_hibernate
+ * after the atomic restore. This is the reason for the toi_in_hibernate
+ * test.
+ **/
+static int __save_image(void)
+{
+        int temp_result, did_copy = 0;
+
+        toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image..");
+
+        toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
+                " - Final values: %d and %d.",
+                pagedir1.size, pagedir2.size);
+
+        toi_cond_pause(1, "About to write pagedir2.");
+
+        temp_result = write_pageset(&pagedir2);
+
+        if (temp_result == -1 || test_result_state(TOI_ABORTED))
+                return 1;
+
+        toi_cond_pause(1, "About to copy pageset 1.");
+
+        if (test_result_state(TOI_ABORTED))
+                return 1;
+
+        toi_deactivate_storage(1);
+
+        toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
+
+        toi_in_hibernate = 1;
+
+        if (toi_go_atomic(PMSG_FREEZE, 1))
+                goto Failed;
+
+        temp_result = toi_hibernate();
+
+#ifdef CONFIG_KGDB
+        if (test_action_state(TOI_POST_RESUME_BREAKPOINT))
+                kgdb_breakpoint();
+#endif
+
+        if (!temp_result)
+                did_copy = 1;
+
+        /* We return here at resume time too! */
+        toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
+
+Failed:
+        if (toi_activate_storage(1))
+                panic("Failed to reactivate our storage.");
+
+        /* Resume time? */
+        if (!toi_in_hibernate) {
+                copyback_post();
+                return 0;
+        }
+
+        /* Nope. Hibernating. So, see if we can save the image... */
+
+        if (temp_result || test_result_state(TOI_ABORTED)) {
+                if (did_copy)
+                        goto abort_reloading_pagedir_two;
+                else
+                        return 1;
+        }
+
+        toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
+                        NULL);
+
+        if (test_result_state(TOI_ABORTED))
+                goto abort_reloading_pagedir_two;
+
+        toi_cond_pause(1, "About to write pageset1.");
+
+        toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1");
+
+        temp_result = write_pageset(&pagedir1);
+
+        /* We didn't overwrite any memory, so no reread needs to be done. */
+        if (test_action_state(TOI_TEST_FILTER_SPEED) ||
+            test_action_state(TOI_TEST_BIO))
+                return 1;
+
+        if (temp_result == 1 || test_result_state(TOI_ABORTED))
+                goto abort_reloading_pagedir_two;
+
+        toi_cond_pause(1, "About to write header.");
+
+        if (test_result_state(TOI_ABORTED))
+                goto abort_reloading_pagedir_two;
+
+        temp_result = write_image_header();
+
+        if (!temp_result && !test_result_state(TOI_ABORTED))
+                return 0;
+
+abort_reloading_pagedir_two:
+        temp_result = read_pageset2(1);
+
+        /* If that failed, we're sunk. Panic! */
+        if (temp_result)
+                panic("Attempt to reload pagedir 2 while aborting "
+                                "a hibernate failed.");
+
+        return 1;
+}
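+
+/*
+ * Summary of the save sequence above: pageset2 is written, the atomic copy
+ * of pageset1 is made, then pageset1 and finally the image header are
+ * written. If something fails after the copy, pageset2 is read back in
+ * (abort_reloading_pagedir_two) so the system can keep running; failing
+ * even that is unrecoverable, hence the panic().
+ */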
+
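+/**
+ * map_ps2_pages - map or unmap the pageset2 pages
+ * @enable:        Whether to map (1) or unmap (0) the pages.
+ *
+ * Walk the pageset2 bitmap and call kernel_map_pages() on each page.
+ * do_save_image() below unmaps pageset2 around __save_image() and remaps
+ * it afterwards, presumably so that stray accesses to those pages during
+ * the save are caught when CONFIG_DEBUG_PAGEALLOC is enabled.
+ **/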
+static void map_ps2_pages(int enable)
+{
+        unsigned long pfn = 0;
+
+        memory_bm_position_reset(pageset2_map);
+        pfn = memory_bm_next_pfn(pageset2_map, 0);
+
+        while (pfn != BM_END_OF_MAP) {
+                struct page *page = pfn_to_page(pfn);
+                kernel_map_pages(page, 1, enable);
+                pfn = memory_bm_next_pfn(pageset2_map, 0);
+        }
+}
+
+/**
+ * do_save_image - save the image and handle the result
+ *
+ * Save the prepared image. If we fail or we're in the path returning
+ * from the atomic restore, clean up.
+ **/
+static int do_save_image(void)
+{
+        int result;
+        map_ps2_pages(0);
+        result = __save_image();
+        map_ps2_pages(1);
+        return result;
+}
+
+/**
+ * do_prepare_image - try to prepare an image
+ *
+ * Seek to initialise and prepare an image to be saved. On failure,
+ * clean up.
+ **/
+static int do_prepare_image(void)
+{
+        int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+
+        if (!restarting && toi_activate_storage(0))
+                return 1;
+
+        /*
+         * If we kept an image, are still keeping it and are hibernating to
+         * RAM (the non-incremental image case), we will return 1 after
+         * hibernating and resuming, provided the power doesn't run out. In
+         * that case, we skip directly to cleaning up and exiting.
+         */
+
+        if (!can_hibernate() ||
+            (test_result_state(TOI_KEPT_IMAGE) &&
+             check_still_keeping_image()))
+                return 1;
+
+        if (toi_init(restarting) || toi_prepare_image() ||
+                        test_result_state(TOI_ABORTED))
+                return 1;
+
+        trap_non_toi_io = 1;
+
+        return 0;
+}
+
+/**
+ * do_check_can_resume - find out whether an image has been stored
+ *
+ * Read whether an image exists. We use the same routine as the
+ * image_exists sysfs entry, and just look to see whether the
+ * first character in the resulting buffer is a '1'.
+ **/
+int do_check_can_resume(void)
+{
+        int result = -1;
+
+        if (toi_activate_storage(0))
+                return -1;
+
+        if (!test_toi_state(TOI_RESUME_DEVICE_OK))
+                toi_attempt_to_parse_resume_device(1);
+
+        if (toiActiveAllocator)
+                result = toiActiveAllocator->image_exists(1);
+
+        toi_deactivate_storage(0);
+        return result;
+}
+
+/**
+ * do_load_atomic_copy - load the first part of an image, if it exists
+ *
+ * Check whether we have an image. If one exists, do sanity checking
+ * (possibly invalidating the image or even rebooting if the user
+ * requests that) before loading it into memory in preparation for the
+ * atomic restore.
+ *
+ * If and only if we have an image loaded and ready to restore, we return 1.
+ **/
+static int do_load_atomic_copy(void)
+{
+        int read_image_result = 0;
+
+        if (sizeof(swp_entry_t) != sizeof(long)) {
+                printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
+                        " of long. Please report this!\n");
+                return 1;
+        }
+
+        if (!resume_file[0])
+                printk(KERN_WARNING "TuxOnIce: "
+                        "You need to use a resume= command line parameter to "
+                        "tell TuxOnIce where to look for an image.\n");
+
+        toi_activate_storage(0);
+
+        if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
+                !toi_attempt_to_parse_resume_device(0)) {
+                /*
+                 * Without a usable storage device we can do nothing -
+                 * even if noresume is given
+                 */
+
+                if (!toiNumAllocators)
+                        printk(KERN_ALERT "TuxOnIce: "
+                          "No storage allocators have been registered.\n");
+                else
+                        printk(KERN_ALERT "TuxOnIce: "
+                                "Missing or invalid storage location "
+                                "(resume= parameter). Please correct and "
+                                "rerun lilo (or equivalent) before "
+                                "hibernating.\n");
+                toi_deactivate_storage(0);
+                return 1;
+        }
+
+        if (allocate_bitmaps())
+                return 1;
+
+        read_image_result = read_pageset1(); /* non fatal error ignored */
+
+        if (test_toi_state(TOI_NORESUME_SPECIFIED))
+                clear_toi_state(TOI_NORESUME_SPECIFIED);
+
+        toi_deactivate_storage(0);
+
+        if (read_image_result)
+                return 1;
+
+        return 0;
+}
+
+/**
+ * prepare_restore_load_alt_image - save & restore alt image variables
+ *
+ * Save and restore the pageset1 maps, when loading an alternate image.
+ **/
+static void prepare_restore_load_alt_image(int prepare)
+{
+        static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save;
+
+        if (prepare) {
+                pageset1_map_save = pageset1_map;
+                pageset1_map = NULL;
+                pageset1_copy_map_save = pageset1_copy_map;
+                pageset1_copy_map = NULL;
+                set_toi_state(TOI_LOADING_ALT_IMAGE);
+                toi_reset_alt_image_pageset2_pfn();
+        } else {
+                toi_free_bitmap(&pageset1_map);
+                pageset1_map = pageset1_map_save;
+                toi_free_bitmap(&pageset1_copy_map);
+                pageset1_copy_map = pageset1_copy_map_save;
+                clear_toi_state(TOI_NOW_RESUMING);
+                clear_toi_state(TOI_LOADING_ALT_IMAGE);
+        }
+}
+
+/**
+ * do_toi_step - perform a step in hibernating or resuming
+ *
+ * Perform a step in hibernating or resuming an image. This abstraction
+ * is in preparation for implementing cluster support, and perhaps replacing
+ * uswsusp too (haven't looked whether that's possible yet).
+ **/
+int do_toi_step(int step)
+{
+        switch (step) {
+        case STEP_HIBERNATE_PREPARE_IMAGE:
+                return do_prepare_image();
+        case STEP_HIBERNATE_SAVE_IMAGE:
+                return do_save_image();
+        case STEP_HIBERNATE_POWERDOWN:
+                return do_post_image_write();
+        case STEP_RESUME_CAN_RESUME:
+                return do_check_can_resume();
+        case STEP_RESUME_LOAD_PS1:
+                return do_load_atomic_copy();
+        case STEP_RESUME_DO_RESTORE:
+                /*
+                 * If we succeed, this doesn't return.
+                 * Instead, we return from do_save_image() in the
+                 * hibernated kernel.
+                 */
+                return toi_atomic_restore();
+        case STEP_RESUME_ALT_IMAGE:
+                printk(KERN_INFO "Trying to resume alternate image.\n");
+                toi_in_hibernate = 0;
+                save_restore_alt_param(SAVE, NOQUIET);
+                prepare_restore_load_alt_image(1);
+                if (!do_check_can_resume()) {
+                        printk(KERN_INFO "Nothing to resume from.\n");
+                        goto out;
+                }
+                if (!do_load_atomic_copy())
+                        toi_atomic_restore();
+
+                printk(KERN_INFO "Failed to load image.\n");
+out:
+                prepare_restore_load_alt_image(0);
+                save_restore_alt_param(RESTORE, NOQUIET);
+                break;
+        case STEP_CLEANUP:
+                do_cleanup(1, 0);
+                break;
+        case STEP_QUIET_CLEANUP:
+                do_cleanup(0, 0);
+                break;
+        }
+
+        return 0;
+}
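+
+/*
+ * For reference, a normal hibernation driven by toi_try_hibernate() below
+ * runs STEP_HIBERNATE_PREPARE_IMAGE, STEP_HIBERNATE_SAVE_IMAGE and then
+ * STEP_HIBERNATE_POWERDOWN, while toi_try_resume() runs
+ * STEP_RESUME_CAN_RESUME, STEP_RESUME_LOAD_PS1 and STEP_RESUME_DO_RESTORE.
+ */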
+
+/* -- Functions for kickstarting a hibernate or resume --- */
+
+/**
+ * toi_try_resume - try to do the steps in resuming
+ *
+ * Check if we have an image and if so try to resume. Clear the status
+ * flags too.
+ **/
+void toi_try_resume(void)
+{
+        set_toi_state(TOI_TRYING_TO_RESUME);
+        resume_attempted = 1;
+
+        current->flags |= PF_MEMALLOC;
+        toi_start_other_threads();
+
+        if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
+                        !do_toi_step(STEP_RESUME_LOAD_PS1))
+                do_toi_step(STEP_RESUME_DO_RESTORE);
+
+        toi_stop_other_threads();
+        do_cleanup(0, 0);
+
+        current->flags &= ~PF_MEMALLOC;
+
+        clear_toi_state(TOI_IGNORE_LOGLEVEL);
+        clear_toi_state(TOI_TRYING_TO_RESUME);
+        clear_toi_state(TOI_NOW_RESUMING);
+}
+
+/**
+ * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume
+ *
+ * Wrapper for when toi_try_resume is called from the swsusp resume path,
+ * rather than from echo > /sys/power/tuxonice/do_resume.
+ **/
+static void toi_sys_power_disk_try_resume(void)
+{
+        resume_attempted = 1;
+
+        /*
+         * There's a comment in kernel/power/disk.c that indicates
+         * we should be able to use mutex_lock_nested below. That
+         * doesn't seem to cut it, though, so let's just turn lockdep
+         * off for now.
+         */
+        lockdep_off();
+
+        if (toi_start_anything(SYSFS_RESUMING))
+                goto out;
+
+        toi_try_resume();
+
+        /*
+         * For initramfs, we have to clear the boot time
+         * flag after trying to resume
+         */
+        clear_toi_state(TOI_BOOT_TIME);
+
+        toi_finish_anything(SYSFS_RESUMING);
+out:
+        lockdep_on();
+}
+
+/**
+ * toi_try_hibernate - try to start a hibernation cycle
+ *
+ * Start a hibernation cycle, coming in from either
+ * echo > /sys/power/tuxonice/do_suspend
+ *
+ * or
+ *
+ * echo disk > /sys/power/state
+ *
+ * In the latter case, we come in without pm_sem taken; in the
+ * former, it has been taken.
+ **/
+int toi_try_hibernate(void)
+{
+        int result = 0, sys_power_disk = 0, retries = 0;
+
+        if (!mutex_is_locked(&tuxonice_in_use)) {
+                /* Came in via /sys/power/disk */
+                if (toi_start_anything(SYSFS_HIBERNATING))
+                        return -EBUSY;
+                sys_power_disk = 1;
+        }
+
+        current->flags |= PF_MEMALLOC;
+
+        if (test_toi_state(TOI_CLUSTER_MODE)) {
+                toi_initiate_cluster_hibernate();
+                goto out;
+        }
+
+prepare:
+        result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
+
+        if (result)
+                goto out;
+
+        if (test_action_state(TOI_FREEZER_TEST))
+                goto out_restore_gfp_mask;
+
+        result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
+
+        if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
+                if (retries < 2) {
+                        do_cleanup(0, 1);
+                        retries++;
+                        clear_result_state(TOI_ABORTED);
+                        extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
+                        printk(KERN_INFO "Automatically adjusting the extra"
+                                " pages allowance to %ld and restarting.\n",
+                                extra_pd1_pages_allowance);
+                        pm_restore_gfp_mask();
+                        goto prepare;
+                }
+
+                printk(KERN_INFO "Adjusted extra pages allowance twice and "
+                        "still couldn't hibernate successfully. Giving up.");
+        }
+
+        /* This code runs at resume time too! */
+        if (!result && toi_in_hibernate)
+                result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
+
+out_restore_gfp_mask:
+        pm_restore_gfp_mask();
+out:
+        do_cleanup(1, 0);
+        current->flags &= ~PF_MEMALLOC;
+
+        if (sys_power_disk)
+                toi_finish_anything(SYSFS_HIBERNATING);
+
+        return result;
+}
+
+/*
+ * channel_no: If !0, -c <channel_no> is added to args (userui).
+ */
+int toi_launch_userspace_program(char *command, int channel_no,
+                int wait, int debug)
+{
+        int retval;
+        static char *envp[] = {
+                        "HOME=/",
+                        "TERM=linux",
+                        "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+                        NULL };
+        static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+                };
+        char *channel = NULL;
+        int arg = 0, size;
+        char test_read[255];
+        char *orig_posn = command;
+
+        if (!strlen(orig_posn))
+                return 1;
+
+        if (channel_no) {
+                channel = toi_kzalloc(4, 6, GFP_KERNEL);
+                if (!channel) {
+                        printk(KERN_INFO "Failed to allocate memory in "
+                                "preparing to launch userspace program.\n");
+                        return 1;
+                }
+        }
+
+        /* Up to 6 args supported */
+        while (arg < 6) {
+                sscanf(orig_posn, "%s", test_read);
+                size = strlen(test_read);
+                if (!(size))
+                        break;
+                argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
+                strcpy(argv[arg], test_read);
+                orig_posn += size + 1;
+                *test_read = 0;
+                arg++;
+        }
+
+        if (channel_no) {
+                sprintf(channel, "-c%d", channel_no);
+                argv[arg] = channel;
+        } else
+                arg--;
+
+        if (debug) {
+                argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
+                strcpy(argv[arg], "--debug");
+        }
+
+        retval = call_usermodehelper(argv[0], argv, envp, wait);
+
+        /*
+         * If the program reports an error, retval = 256. Don't complain
+         * about that here.
+         */
+        if (retval && retval != 256)
+                printk(KERN_ERR "Failed to launch userspace program '%s': "
+                                "Error %d\n", command, retval);
+
+        {
+                int i;
+                for (i = 0; i < arg; i++)
+                        if (argv[i] && argv[i] != channel)
+                                toi_kfree(5, argv[i], sizeof(*argv[i]));
+        }
+
+        toi_kfree(4, channel, sizeof(*channel));
+
+        return retval;
+}
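+
+/*
+ * Illustrative example (hypothetical command line): calling this with
+ * command "/usr/sbin/tuxoniceui_text --debug-everything" and channel_no 1
+ * yields argv = { "/usr/sbin/tuxoniceui_text", "--debug-everything", "-c1",
+ * NULL }, with envp supplying the minimal HOME/TERM/PATH environment for
+ * call_usermodehelper().
+ */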
+
+/*
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_LONG("extra_pages_allowance", SYSFS_RW,
+                        &extra_pd1_pages_allowance, 0, LONG_MAX, 0),
+        SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read,
+                        image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL),
+        SYSFS_STRING("resume", SYSFS_RW, resume_file, 255,
+                        SYSFS_NEEDS_SM_FOR_WRITE,
+                        attempt_to_parse_resume_device2),
+        SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255,
+                        SYSFS_NEEDS_SM_FOR_WRITE,
+                        attempt_to_parse_alt_resume_param),
+        SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0,
+                        NULL),
+        SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_IGNORE_ROOTFS, 0),
+        SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2,
+                        INT_MAX, 0),
+        SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0),
+        SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_NO_MULTITHREADED_IO, 0),
+        SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_NO_FLUSHER_THREAD, 0),
+        SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_PAGESET2_FULL, 0),
+        SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
+        SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_REPLACE_SWSUSP, 0),
+        SYSFS_STRING("resume_commandline", SYSFS_RW,
+                        toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
+                        NULL),
+        SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL),
+        SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_FREEZER_TEST, 0),
+        SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0),
+        SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_TEST_FILTER_SPEED, 0),
+        SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_NO_PAGESET2, 0),
+        SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_NO_PS2_IF_UNNEEDED, 0),
+        SYSFS_STRING("binary_signature", SYSFS_READONLY,
+                        tuxonice_signature, 9, 0, NULL),
+        SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0,
+                        NULL),
+#ifdef CONFIG_KGDB
+        SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_POST_RESUME_BREAKPOINT, 0),
+#endif
+        SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_NO_READAHEAD, 0),
+        SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_TRACE_DEBUG_ON, 0),
+#ifdef CONFIG_TOI_KEEP_IMAGE
+        SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE,
+                        0),
+#endif
+#ifdef CONFIG_TOI_INCREMENTAL
+        SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0,
+                        NULL),
+        SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_INCREMENTAL_IMAGE, 1),
+#endif
+};
+
+static struct toi_core_fns my_fns = {
+        .get_nonconflicting_page = __toi_get_nonconflicting_page,
+        .post_context_save = __toi_post_context_save,
+        .try_hibernate = toi_try_hibernate,
+        .try_resume = toi_sys_power_disk_try_resume,
+};
+
+/**
+ * core_load - initialisation of TuxOnIce core
+ *
+ * Initialise the core, beginning with sysfs. Checksum and so on are part of
+ * the core, but have their own initialisation routines because they either
+ * aren't compiled in all the time or have their own subdirectories.
+ **/
+static __init int core_load(void)
+{
+        int i, numfiles = ARRAY_SIZE(sysfs_params);
+
+        printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
+                        " (http://tuxonice.net)\n");
+
+        if (!hibernation_available()) {
+                printk(KERN_INFO "TuxOnIce disabled: hibernation is disabled "
+                                "in this kernel.\n");
+                return 1;
+        }
+
+        if (toi_sysfs_init())
+                return 1;
+
+        for (i = 0; i < numfiles; i++)
+                toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+
+        toi_core_fns = &my_fns;
+
+        if (toi_alloc_init())
+                return 1;
+        if (toi_checksum_init())
+                return 1;
+        if (toi_usm_init())
+                return 1;
+        if (toi_ui_init())
+                return 1;
+        if (toi_poweroff_init())
+                return 1;
+        if (toi_cluster_init())
+                return 1;
+        if (toi_cbw_init())
+                return 1;
+
+        return 0;
+}
+
+late_initcall(core_load);
diff --git b/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
new file mode 100644
index 0000000..a8c5f36
--- /dev/null
+++ b/kernel/power/tuxonice_incremental.c
@@ -0,0 +1,402 @@
+/*
+ * kernel/power/tuxonice_incremental.c
+ *
+ * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains routines related to storing incremental images - that
+ * is, retaining an image after an initial cycle and then storing incremental
+ * changes on subsequent hibernations.
+ *
+ * Based in part on...
+ *
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/mm.h>
+#include <linux/tuxonice.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+#include "tuxonice_pageflags.h"
+#include "tuxonice_builtin.h"
+#include "power.h"
+
+int toi_do_incremental_initcall;
+
+extern void kdb_init(int level);
+extern noinline void kgdb_breakpoint(void);
+
+#undef pr_debug
+#if 0
+#define pr_debug(a, b...) do { printk(a, ##b); } while (0)
+#else
+#define pr_debug(a, b...) do { } while (0)
+#endif
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
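+/*
+ * Worked example of the multipliers above, assuming x86_64 with 4 KiB pages
+ * and 512 entries per table: PTE_LEVEL_MULT = 4 KiB, PMD_LEVEL_MULT = 2 MiB,
+ * PUD_LEVEL_MULT = 1 GiB and PGD_LEVEL_MULT = 512 GiB - i.e. the amount of
+ * virtual address space covered by one entry at each level.
+ */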
+
+/*
+ * In the pagetable dump code this is derived from, this is called on a break
+ * in a continuous series of entries to print what was collected so far. Here
+ * we don't print anything: we mark the page table page containing the entry
+ * as untracked, once per page.
+ */
+static void note_page(void *addr)
+{
+    static struct page *lastpage;
+    struct page *page;
+
+    page = virt_to_page(addr);
+
+    if (page != lastpage) {
+        unsigned int level;
+        pte_t *pte = lookup_address((unsigned long) addr, &level);
+        struct page *pt_page2 = pte_page(*pte);
+        //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
+        SetPageTOI_Untracked(pt_page2);
+        lastpage = page;
+    }
+}
+
+static void walk_pte_level(pmd_t addr)
+{
+        int i;
+        pte_t *start;
+
+        start = (pte_t *) pmd_page_vaddr(addr);
+        for (i = 0; i < PTRS_PER_PTE; i++) {
+                note_page(start);
+                start++;
+        }
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(pud_t addr)
+{
+        int i;
+        pmd_t *start;
+
+        start = (pmd_t *) pud_page_vaddr(addr);
+        for (i = 0; i < PTRS_PER_PMD; i++) {
+                if (!pmd_none(*start)) {
+                        if (pmd_large(*start) || !pmd_present(*start))
+                                note_page(start);
+                        else
+                                walk_pte_level(*start);
+                } else
+                        note_page(start);
+                start++;
+        }
+}
+
+#else
+#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(pgd_t addr)
+{
+        int i;
+        pud_t *start;
+
+        start = (pud_t *) pgd_page_vaddr(addr);
+
+        for (i = 0; i < PTRS_PER_PUD; i++) {
+                if (!pud_none(*start)) {
+                        if (pud_large(*start) || !pud_present(*start))
+                                note_page(start);
+                        else
+                                walk_pmd_level(*start);
+                } else
+                        note_page(start);
+
+                start++;
+        }
+}
+
+#else
+#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
+#endif
+
+/*
+ * The function this is based on isn't static in the original at the time of
+ * writing, so it is renamed here to avoid a clash with that symbol.
+ */
+static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+        pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+        pgd_t *start = swapper_pg_dir;
+#endif
+        int i;
+        if (pgd) {
+                start = pgd;
+        }
+
+        for (i = 0; i < PTRS_PER_PGD; i++) {
+                if (!pgd_none(*start)) {
+                        if (pgd_large(*start) || !pgd_present(*start))
+                                note_page(start);
+                        else
+                                walk_pud_level(*start);
+                } else
+                        note_page(start);
+
+                start++;
+        }
+
+        /* Flush out the last page */
+        note_page(start);
+}
+
+#ifdef CONFIG_PARAVIRT
+extern struct pv_info pv_info;
+
+static void toi_set_paravirt_ops_untracked(void) {
+    int i;
+
+    unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
+                  pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
+    //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
+    for (i = pvpfn; i <= pvpfn_end; i++) {
+        SetPageTOI_Untracked(pfn_to_page(i));
+    }
+}
+#else
+#define toi_set_paravirt_ops_untracked() do { } while (0)
+#endif
+
+extern void toi_mark_per_cpus_pages_untracked(void);
+
+void toi_untrack_stack(unsigned long *stack)
+{
+    int i;
+    struct page *stack_page = virt_to_page(stack);
+
+    for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
+        pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
+        SetPageTOI_Untracked(stack_page + i);
+    }
+}
+
+void toi_untrack_process(struct task_struct *p)
+{
+    SetPageTOI_Untracked(virt_to_page(p));
+    pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));
+
+    toi_untrack_stack(p->stack);
+}
+
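+/*
+ * Pages flagged TOI_Untracked are skipped by toi_reset_dirtiness() below,
+ * presumably because they are written to by code that has to keep running
+ * while everything else is read-only: page table pages, paravirt
+ * instructions, task structs and stacks, per-cpu areas and so on.
+ */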
+void toi_generate_untracked_map(void)
+{
+    struct task_struct *p, *t;
+    struct page *page;
+    pte_t *pte;
+    int i;
+    unsigned int level;
+    static int been_here = 0;
+
+    if (been_here)
+        return;
+
+    been_here = 1;
+
+    /* Pagetable pages */
+    toi_ptdump_walk_pgd_level(NULL);
+
+    /* Printk buffer - not normally needed but can be helpful for debugging. */
+    //toi_set_logbuf_untracked();
+
+    /* Paravirt ops */
+    toi_set_paravirt_ops_untracked();
+
+    /* Task structs and stacks */
+    for_each_process_thread(p, t) {
+        toi_untrack_process(p);
+        //toi_untrack_stack((unsigned long *) t->thread.sp);
+    }
+
+    for (i = 0; i < NR_CPUS; i++) {
+        struct task_struct *idle = idle_task(i);
+
+        if (idle) {
+            pr_debug("Untrack idle process for CPU %d.\n", i);
+            toi_untrack_process(idle);
+        }
+
+        /* IRQ stack */
+        pr_debug("Untrack IRQ stack for CPU %d.\n", i);
+        toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
+    }
+
+    /* Per CPU data */
+    //pr_debug("Untracking per CPU variable pages.\n");
+    toi_mark_per_cpus_pages_untracked();
+
+    /* Init stack - for bringing up secondary CPUs */
+    page = virt_to_page(init_stack);
+    for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
+        SetPageTOI_Untracked(page + i);
+    }
+
+    pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
+    SetPageTOI_Untracked(pte_page(*pte));
+    SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
+}
+
+/**
+ * toi_reset_dirtiness_one - clear the dirty flag and write protect one page
+ * @pfn:        The pfn of the page to reset.
+ * @verbose:    Whether to log the page being made read only.
+ */
+
+void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
+{
+    struct page *page = pfn_to_page(pfn);
+
+    /**
+     * Don't worry about whether the Dirty flag is
+     * already set. If this is our first call, it
+     * won't be.
+     */
+
+    preempt_disable();
+
+    ClearPageTOI_Dirty(page);
+    SetPageTOI_RO(page);
+    if (verbose)
+        printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
+
+    set_memory_ro((unsigned long) page_address(page), 1);
+
+    preempt_enable();
+}
+
+/**
+ * TuxOnIce's incremental image support works by marking all memory apart from
+ * the page tables read-only, then in the page-faults that result enabling
+ * writing if appropriate and flagging the page as dirty. Free pages are also
+ * marked as dirty and not protected so that if allocated, they will be included
+ * in the image without further processing.
+ *
+ * toi_reset_dirtiness is called when an image exists and incremental images are
+ * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
+ *
+ * This routine should be called from a context in which only one CPU is running,
+ * to avoid races when setting the page dirty/read-only flags.
+ *
+ * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
+ *
+ * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
+ **/
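+
+/*
+ * Illustrative lifecycle of a tracked page (a sketch derived from the
+ * description above, not a separate mechanism):
+ *
+ *   toi_reset_dirtiness_one(): PageTOI_RO set, page made read only
+ *   write fault on the page  : protection dropped, PageTOI_Dirty set
+ *   next hibernation cycle   : dirty pages are written to the image again
+ *   toi_reset_dirtiness()    : dirty flag cleared, protection reapplied
+ */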
+
+int toi_reset_dirtiness(int verbose)
+{
+        struct zone *zone;
+        unsigned long loop;
+        int allocated_map = 0;
+
+        toi_generate_untracked_map();
+
+        if (!free_map) {
+            if (!toi_alloc_bitmap(&free_map))
+                return -ENOMEM;
+            allocated_map = 1;
+        }
+
+        toi_generate_free_page_map();
+
+        pr_debug(KERN_EMERG "Reset dirtiness.\n");
+        for_each_populated_zone(zone) {
+            // 64 bit only. No need to worry about highmem.
+            for (loop = 0; loop < zone->spanned_pages; loop++) {
+                unsigned long pfn = zone->zone_start_pfn + loop;
+                struct page *page;
+                int chunk_size;
+
+                if (!pfn_valid(pfn)) {
+                    continue;
+                }
+
+                chunk_size = toi_size_of_free_region(zone, pfn);
+                if (chunk_size) {
+                    loop += chunk_size - 1;
+                    continue;
+                }
+
+                page = pfn_to_page(pfn);
+
+                if (PageNosave(page) || !saveable_page(zone, pfn)) {
+                    continue;
+                }
+
+                if (PageTOI_Untracked(page)) {
+                    continue;
+                }
+
+                /**
+                 * Do we need to (re)protect the page?
+                 * If it is already protected (PageTOI_RO), there is
+                 * nothing to do - skip the following.
+                 * If it is marked as dirty (PageTOI_Dirty), it was
+                 * either free and has been allocated or has been
+                 * written to and marked dirty. Reset the dirty flag
+                 * and (re)apply the protection.
+                 */
+                if (!PageTOI_RO(page)) {
+                    toi_reset_dirtiness_one(pfn, verbose);
+                }
+            }
+        }
+
+        pr_debug(KERN_EMERG "Done resetting dirtiness.\n");
+
+        if (allocated_map) {
+            toi_free_bitmap(&free_map);
+        }
+        return 0;
+}
+
+static int toi_reset_dirtiness_initcall(void)
+{
+    if (toi_do_incremental_initcall) {
+        pr_info("TuxOnIce: Enabling dirty page tracking.\n");
+        toi_reset_dirtiness(0);
+    }
+    return 1;
+}
+extern void toi_generate_untracked_map(void);
+
+/* Leave early_initcall for pages to register untracked sections. */
+early_initcall(toi_reset_dirtiness_initcall);
+
+static int __init toi_incremental_initcall_setup(char *str)
+{
+        int value;
+
+        if (sscanf(str, "=%d", &value) && value)
+                toi_do_incremental_initcall = value;
+
+        return 1;
+}
+__setup("toi_incremental_initcall", toi_incremental_initcall_setup);
diff --git b/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
new file mode 100644
index 0000000..2db9343
--- /dev/null
+++ b/kernel/power/tuxonice_io.c
@@ -0,0 +1,1936 @@
+/*
+ * kernel/power/tuxonice_io.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for hibernating.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/version.h>
+#include <linux/utsname.h>
+#include <linux/mount.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+#include <linux/fs_struct.h>
+#include <linux/bio.h>
+#include <linux/fs_uuid.h>
+#include <linux/kmod.h>
+#include <asm/tlbflush.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_alloc.h"
+
+char alt_resume_param[256];
+
+/* Version read from image header at resume */
+static int toi_image_header_version;
+
+#define read_if_version(VERS, VAR, DESC, ERR_ACT) do {                        \
+        if (likely(toi_image_header_version >= VERS))                        \
+                if (toiActiveAllocator->rw_header_chunk(READ, NULL,        \
+                                (char *) &VAR, sizeof(VAR))) {                \
+                        abort_hibernate(TOI_FAILED_IO,                        \
+                                        "Failed to read " DESC ".");        \
+                        ERR_ACT;                                        \
+                }                                                        \
+} while (0)
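+
+/*
+ * DESC is expected to be a string literal; a hypothetical use (field and
+ * label invented purely for illustration) would look something like:
+ *
+ *   read_if_version(3, sh->some_field, "some field", goto out);
+ */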
+
+/* Variables shared between threads and updated under the mutex */
+static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
+static int io_index, io_nextupdate, io_pc, io_pc_step;
+static DEFINE_MUTEX(io_mutex);
+static DEFINE_PER_CPU(struct page *, last_sought);
+static DEFINE_PER_CPU(struct page *, last_high_page);
+static DEFINE_PER_CPU(char *, checksum_locn);
+static DEFINE_PER_CPU(struct pbe *, last_low_page);
+static atomic_t io_count;
+atomic_t toi_io_workers;
+
+static int using_flusher;
+
+DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
+
+int toi_bio_queue_flusher_should_finish;
+
+int toi_max_workers;
+
+static char *image_version_error = "The image header version is newer than " \
+        "this kernel supports.";
+
+struct toi_module_ops *first_filter;
+
+static atomic_t toi_num_other_threads;
+static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue);
+enum toi_worker_commands {
+        TOI_IO_WORKER_STOP,
+        TOI_IO_WORKER_RUN,
+        TOI_IO_WORKER_EXIT
+};
+static enum toi_worker_commands toi_worker_command;
+
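+/*
+ * For reference: the resume= values parsed below typically name an allocator
+ * and a device, e.g. something like "swap:/dev/<partition>" for the swap
+ * allocator (illustrative only - the exact syntax is defined by each
+ * allocator's parse_sig_location()).
+ */
+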
+/**
+ * toi_attempt_to_parse_resume_device - determine if we can hibernate
+ *
+ * Can we hibernate, using the current resume= parameter?
+ **/
+int toi_attempt_to_parse_resume_device(int quiet)
+{
+        struct list_head *Allocator;
+        struct toi_module_ops *thisAllocator;
+        int result, returning = 0;
+
+        if (toi_activate_storage(0))
+                return 0;
+
+        toiActiveAllocator = NULL;
+        clear_toi_state(TOI_RESUME_DEVICE_OK);
+        clear_toi_state(TOI_CAN_RESUME);
+        clear_result_state(TOI_ABORTED);
+
+        if (!toiNumAllocators) {
+                if (!quiet)
+                        printk(KERN_INFO "TuxOnIce: No storage allocators have "
+                                "been registered. Hibernating will be "
+                                "disabled.\n");
+                goto cleanup;
+        }
+
+        list_for_each(Allocator, &toiAllocators) {
+                thisAllocator = list_entry(Allocator, struct toi_module_ops,
+                                                                type_list);
+
+                /*
+                 * Not sure why you'd want to disable an allocator, but
+                 * we should honour the flag if we're providing it
+                 */
+                if (!thisAllocator->enabled)
+                        continue;
+
+                result = thisAllocator->parse_sig_location(
+                                resume_file, (toiNumAllocators == 1),
+                                quiet);
+
+                switch (result) {
+                case -EINVAL:
+                        /* For this allocator, but not a valid
+                         * configuration. Error already printed. */
+                        goto cleanup;
+
+                case 0:
+                        /* For this allocator and valid. */
+                        toiActiveAllocator = thisAllocator;
+
+                        set_toi_state(TOI_RESUME_DEVICE_OK);
+                        set_toi_state(TOI_CAN_RESUME);
+                        returning = 1;
+                        goto cleanup;
+                }
+        }
+        if (!quiet)
+                printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
+                                "found. Resuming disabled.\n");
+cleanup:
+        toi_deactivate_storage(0);
+        return returning;
+}
+
+void attempt_to_parse_resume_device2(void)
+{
+        toi_prepare_usm();
+        toi_attempt_to_parse_resume_device(0);
+        toi_cleanup_usm();
+}
+
+void save_restore_alt_param(int replace, int quiet)
+{
+        static char resume_param_save[255];
+        static unsigned long toi_state_save;
+
+        if (replace) {
+                toi_state_save = toi_state;
+                strcpy(resume_param_save, resume_file);
+                strcpy(resume_file, alt_resume_param);
+        } else {
+                strcpy(resume_file, resume_param_save);
+                toi_state = toi_state_save;
+        }
+        toi_attempt_to_parse_resume_device(quiet);
+}
+
+void attempt_to_parse_alt_resume_param(void)
+{
+        int ok = 0;
+
+        /* Temporarily set resume_param to the poweroff value */
+        if (!strlen(alt_resume_param))
+                return;
+
+        printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
+        save_restore_alt_param(SAVE, NOQUIET);
+        if (test_toi_state(TOI_CAN_RESUME))
+                ok = 1;
+
+        printk(KERN_INFO "=== Done ===\n");
+        save_restore_alt_param(RESTORE, QUIET);
+
+        /* If not ok, clear the string */
+        if (ok)
+                return;
+
+        printk(KERN_INFO "Can't resume from that location; clearing "
+                        "alt_resume_param.\n");
+        alt_resume_param[0] = '\0';
+}
+
+/**
+ * noresume_reset_modules - reset data structures when not resuming
+ *
+ * When we read the start of an image, modules (and especially the
+ * active allocator) might need to reset data structures if we
+ * decide to remove the image rather than resuming from it.
+ **/
+static void noresume_reset_modules(void)
+{
+        struct toi_module_ops *this_filter;
+
+        list_for_each_entry(this_filter, &toi_filters, type_list)
+                if (this_filter->noresume_reset)
+                        this_filter->noresume_reset();
+
+        if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
+                toiActiveAllocator->noresume_reset();
+}
+
+/**
+ * fill_toi_header - fill the hibernate header structure
+ * @sh:        Header data structure to be filled.
+ **/
+static int fill_toi_header(struct toi_header *sh)
+{
+        int i, error;
+
+        error = init_header((struct swsusp_info *) sh);
+        if (error)
+                return error;
+
+        sh->pagedir = pagedir1;
+        sh->pageset_2_size = pagedir2.size;
+        sh->param0 = toi_result;
+        sh->param1 = toi_bkd.toi_action;
+        sh->param2 = toi_bkd.toi_debug_state;
+        sh->param3 = toi_bkd.toi_default_console_level;
+        sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
+        for (i = 0; i < 4; i++)
+                sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
+        sh->bkd = boot_kernel_data_buffer;
+        return 0;
+}
+
+/**
+ * rw_init_modules - initialise modules
+ * @rw:                Whether we are reading or writing an image.
+ * @which:        Section of the image being processed.
+ *
+ * Iterate over modules, preparing the ones that will be used to read or write
+ * data.
+ **/
+static int rw_init_modules(int rw, int which)
+{
+        struct toi_module_ops *this_module;
+        /* Initialise page transformers */
+        list_for_each_entry(this_module, &toi_filters, type_list) {
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->rw_init && this_module->rw_init(rw, which)) {
+                        abort_hibernate(TOI_FAILED_MODULE_INIT,
+                                "Failed to initialize the %s filter.",
+                                this_module->name);
+                        return 1;
+                }
+        }
+
+        /* Initialise allocator */
+        if (toiActiveAllocator->rw_init(rw, which)) {
+                abort_hibernate(TOI_FAILED_MODULE_INIT,
+                                "Failed to initialise the allocator.");
+                return 1;
+        }
+
+        /* Initialise other modules */
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    this_module->type == FILTER_MODULE ||
+                    this_module->type == WRITER_MODULE)
+                        continue;
+                if (this_module->rw_init && this_module->rw_init(rw, which)) {
+                        set_abort_result(TOI_FAILED_MODULE_INIT);
+                        printk(KERN_INFO "Setting aborted flag due to module "
+                                        "init failure.\n");
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+/**
+ * rw_cleanup_modules - cleanup modules
+ * @rw:        Whether we are reading or writing an image.
+ *
+ * Cleanup components after reading or writing a set of pages.
+ * Only the allocator may fail.
+ **/
+static int rw_cleanup_modules(int rw)
+{
+        struct toi_module_ops *this_module;
+        int result = 0;
+
+        /* Cleanup other modules */
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    this_module->type == FILTER_MODULE ||
+                    this_module->type == WRITER_MODULE)
+                        continue;
+                if (this_module->rw_cleanup)
+                        result |= this_module->rw_cleanup(rw);
+        }
+
+        /* Flush data and cleanup */
+        list_for_each_entry(this_module, &toi_filters, type_list) {
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->rw_cleanup)
+                        result |= this_module->rw_cleanup(rw);
+        }
+
+        result |= toiActiveAllocator->rw_cleanup(rw);
+
+        return result;
+}
+
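+/*
+ * Note on the search below: pbe entries are stored PBES_PER_PAGE to a page,
+ * with the last entry's .next linking to the next page of entries, and the
+ * entries within each page ordered by orig_address - that ordering is what
+ * the binary search in copy_page_from_orig_page() relies on. (Comment added
+ * for orientation; the layout is inferred from the walk and search code.)
+ */
+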
+static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high)
+{
+        int index, min, max;
+        struct page *high_page = NULL,
+                    **my_last_high_page = raw_cpu_ptr(&last_high_page),
+                    **my_last_sought = raw_cpu_ptr(&last_sought);
+        struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page);
+        void *compare;
+
+        if (is_high) {
+                if (*my_last_sought && *my_last_high_page &&
+                                *my_last_sought < orig_page)
+                        high_page = *my_last_high_page;
+                else
+                        high_page = (struct page *) restore_highmem_pblist;
+                this = (struct pbe *) kmap(high_page);
+                compare = orig_page;
+        } else {
+                if (*my_last_sought && *my_last_low_page &&
+                                *my_last_sought < orig_page)
+                        this = *my_last_low_page;
+                else
+                        this = restore_pblist;
+                compare = page_address(orig_page);
+        }
+
+        *my_last_sought = orig_page;
+
+        /* Locate page containing pbe */
+        while (this[PBES_PER_PAGE - 1].next &&
+                        this[PBES_PER_PAGE - 1].orig_address < compare) {
+                if (is_high) {
+                        struct page *next_high_page = (struct page *)
+                                this[PBES_PER_PAGE - 1].next;
+                        kunmap(high_page);
+                        this = kmap(next_high_page);
+                        high_page = next_high_page;
+                } else
+                        this = this[PBES_PER_PAGE - 1].next;
+        }
+
+        /* Do a binary search within the page */
+        min = 0;
+        max = PBES_PER_PAGE;
+        index = PBES_PER_PAGE / 2;
+        while (max - min) {
+                if (!this[index].orig_address ||
+                    this[index].orig_address > compare)
+                        max = index;
+                else if (this[index].orig_address == compare) {
+                        if (is_high) {
+                                struct page *page = this[index].address;
+                                *my_last_high_page = high_page;
+                                kunmap(high_page);
+                                return page;
+                        }
+                        *my_last_low_page = this;
+                        return virt_to_page(this[index].address);
+                } else
+                        min = index;
+                index = ((max + min) / 2);
+        }
+
+        if (is_high)
+                kunmap(high_page);
+
+        abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
+                " orig page %p. This[min].orig_address=%p.\n", orig_page,
+                this[index].orig_address);
+        return NULL;
+}
+
+/**
+ * write_next_page - write the next page in a pageset
+ * @data_pfn: The pfn where the next data to write is located.
+ * @my_io_index: The index of the page in the pageset.
+ * @write_pfn: The pfn to write in the image (where the data belongs).
+ *
+ * Get the pfn of the next page to write, map the page if necessary and do the
+ * write.
+ **/
+static int write_next_page(unsigned long *data_pfn, int *my_io_index,
+                unsigned long *write_pfn)
+{
+        struct page *page;
+        char **my_checksum_locn = raw_cpu_ptr(&checksum_locn);
+        int result = 0, was_present;
+
+        *data_pfn = memory_bm_next_pfn(io_map, 0);
+
+        /* Another thread could have beaten us to it. */
+        if (*data_pfn == BM_END_OF_MAP) {
+                if (atomic_read(&io_count)) {
+                        printk(KERN_INFO "Ran out of pfns but io_count is "
+                                        "still %d.\n", atomic_read(&io_count));
+                        BUG();
+                }
+                mutex_unlock(&io_mutex);
+                return -ENODATA;
+        }
+
+        *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
+
+        memory_bm_clear_bit(io_map, 0, *data_pfn);
+        page = pfn_to_page(*data_pfn);
+
+        was_present = kernel_page_present(page);
+        if (!was_present)
+                kernel_map_pages(page, 1, 1);
+
+        if (io_pageset == 1)
+                *write_pfn = memory_bm_next_pfn(pageset1_map, 0);
+        else {
+                *write_pfn = *data_pfn;
+                *my_checksum_locn = tuxonice_get_next_checksum();
+        }
+
+        TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index);
+
+        mutex_unlock(&io_mutex);
+
+        if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn))
+                return 1;
+
+        result = first_filter->write_page(*write_pfn, TOI_PAGE, page,
+                        PAGE_SIZE);
+
+        if (!was_present)
+                kernel_map_pages(page, 1, 0);
+
+        return result;
+}
+
+/**
+ * read_next_page - read the next page in a pageset
+ * @my_io_index: The index of the page in the pageset.
+ * @write_pfn: The pfn in which the data belongs.
+ *
+ * Read a page of the image into our buffer. It can happen (here and in the
+ * write routine) that threads don't get run until after other CPUs have done
+ * all the work. This was the cause of the long-standing issue with
+ * occasionally getting -ENODATA errors at the end of reading the image. We
+ * therefore need to check there's actually a page to read before trying to
+ * retrieve one.
+ **/
+
+static int read_next_page(int *my_io_index, unsigned long *write_pfn,
+                struct page *buffer)
+{
+        unsigned int buf_size = PAGE_SIZE;
+        unsigned long left = atomic_read(&io_count);
+
+        if (!left)
+                return -ENODATA;
+
+        /* Start off assuming the page we read isn't resaved */
+        *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
+
+        mutex_unlock(&io_mutex);
+
+        /*
+         * Are we aborting? If so, don't submit any more I/O as
+         * resetting the resume_attempted flag (from ui.c) will
+         * clear the bdev flags, making this thread oops.
+         */
+        if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
+                atomic_dec(&toi_io_workers);
+                if (!atomic_read(&toi_io_workers)) {
+                        /*
+                         * So we can be sure we'll have memory for
+                         * marking that we haven't resumed.
+                         */
+                        rw_cleanup_modules(READ);
+                        set_toi_state(TOI_IO_STOPPED);
+                }
+                while (1)
+                        schedule();
+        }
+
+        /*
+         * See toi_bio_read_page in tuxonice_bio.c:
+         * read the next page in the image.
+         */
+        return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size);
+}
+
+static void use_read_page(unsigned long write_pfn, struct page *buffer)
+{
+        struct page *final_page = pfn_to_page(write_pfn),
+                    *copy_page = final_page;
+        char *virt, *buffer_virt;
+        int was_present, cpu = smp_processor_id();
+        unsigned long idx = 0;
+
+        if (io_pageset == 1 && (!pageset1_copy_map ||
+                        !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) {
+                int is_high = PageHighMem(final_page);
+                copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high);
+        }
+
+        if (!memory_bm_test_bit(io_map, cpu, write_pfn)) {
+                int test = !memory_bm_test_bit(io_map, cpu, write_pfn);
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test);
+                mutex_lock(&io_mutex);
+                idx = atomic_add_return(1, &io_count);
+                mutex_unlock(&io_mutex);
+                return;
+        }
+
+        virt = kmap(copy_page);
+        buffer_virt = kmap(buffer);
+        was_present = kernel_page_present(copy_page);
+        if (!was_present)
+                kernel_map_pages(copy_page, 1, 1);
+        memcpy(virt, buffer_virt, PAGE_SIZE);
+        if (!was_present)
+                kernel_map_pages(copy_page, 1, 0);
+        kunmap(copy_page);
+        kunmap(buffer);
+        memory_bm_clear_bit(io_map, cpu, write_pfn);
+        TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset);
+}
+
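+/*
+ * status_update() recomputes the progress text and an ETA. Arithmetic example
+ * with made-up numbers: 25600 pages done in 2000 ms gives
+ * pgs_per_s = 1000 * 25600 / 2000 = 12800; with 51200 pages still to go the
+ * estimate is DIV_ROUND_UP(51200, 12800) = 4 seconds.
+ */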
+static unsigned long status_update(int writing, unsigned long done,
+                unsigned long ticks)
+{
+        int cs_index = writing ? 0 : 1;
+        unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks;
+        unsigned long msec = jiffies_to_msecs(abs(ticks_so_far));
+        unsigned long pgs_per_s, estimate = 0, pages_left;
+
+        if (msec) {
+                pages_left = io_barmax - done;
+                pgs_per_s = 1000 * done / msec;
+                if (pgs_per_s)
+                        estimate = DIV_ROUND_UP(pages_left, pgs_per_s);
+        }
+
+        if (estimate && ticks > HZ / 2)
+                return toi_update_status(done, io_barmax,
+                        " %d/%d MB (%lu sec left)",
+                        MB(done+1), MB(io_barmax), estimate);
+
+        return toi_update_status(done, io_barmax, " %d/%d MB",
+                MB(done+1), MB(io_barmax));
+}
+
+/**
+ * worker_rw_loop - main loop to read/write pages
+ *
+ * The main I/O loop for reading or writing pages. The io_map bitmap is used to
+ * track the pages to read/write.
+ * If we are reading, the pages are loaded to their final (mapped) pfn.
+ * Data is non-zero iff this is a thread started via toi_start_other_threads.
+ * In that case, we stay in here until told to quit.
+ **/
+static int worker_rw_loop(void *data)
+{
+        unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4,
+                      jif_index = 1, start_time = jiffies, thread_num;
+        int result = 0, my_io_index = 0, last_worker;
+        struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
+        cpumask_var_t orig_mask;
+
+        if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) {
+                printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data);
+                result = -ENOMEM;
+                goto out;
+        }
+
+        cpumask_copy(orig_mask, tsk_cpus_allowed(current));
+
+        current->flags |= PF_NOFREEZE;
+
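+        /*
+         * data is NULL when we are called directly from do_rw_loop(), and
+         * (thread index + 1) for the kthreads created by
+         * toi_start_other_threads(); the "(unsigned long) data" test near
+         * the bottom of this function keys off that to decide whether to
+         * wait here for further commands.
+         */
+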
+top:
+        mutex_lock(&io_mutex);
+        thread_num = atomic_read(&toi_io_workers);
+
+        cpumask_copy(tsk_cpus_allowed(current), orig_mask);
+        schedule();
+
+        atomic_inc(&toi_io_workers);
+
+        while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
+                !(io_write && test_result_state(TOI_ABORTED)) &&
+                toi_worker_command == TOI_IO_WORKER_RUN) {
+                if (!thread_num && jiffies > next_jiffies) {
+                        next_jiffies += HZ / 4;
+                        if (toiActiveAllocator->update_throughput_throttle)
+                                toiActiveAllocator->update_throughput_throttle(
+                                                jif_index);
+                        jif_index++;
+                }
+
+                /*
+                 * What page to use? If reading, don't know yet which page's
+                 * data will be read, so always use the buffer. If writing,
+                 * use the copy (Pageset1) or original page (Pageset2), but
+                 * always write the pfn of the original page.
+                 */
+                if (io_write)
+                        result = write_next_page(&data_pfn, &my_io_index,
+                                        &write_pfn);
+                else /* Reading */
+                        result = read_next_page(&my_io_index, &write_pfn,
+                                        buffer);
+
+                if (result) {
+                        mutex_lock(&io_mutex);
+                        /* Nothing to do? */
+                        if (result == -ENODATA) {
+                                toi_message(TOI_IO, TOI_VERBOSE, 0,
+                                        "Thread %d has no more work.",
+                                        smp_processor_id());
+                                break;
+                        }
+
+                        io_result = result;
+
+                        if (io_write) {
+                                printk(KERN_INFO "Write chunk returned %d.\n",
+                                                result);
+                                abort_hibernate(TOI_FAILED_IO,
+                                        "Failed to write a chunk of the "
+                                        "image.");
+                                break;
+                        }
+
+                        if (io_pageset == 1) {
+                                printk(KERN_ERR "\nBreaking out of I/O loop "
+                                        "because of result code %d.\n", result);
+                                break;
+                        }
+                        panic("Read chunk returned (%d)", result);
+                }
+
+                /*
+                 * Discard reads of resaved pages while reading ps2
+                 * and unwanted pages while rereading ps2 when aborting.
+                 */
+                if (!io_write) {
+                        if (!PageResave(pfn_to_page(write_pfn)))
+                                use_read_page(write_pfn, buffer);
+                        else {
+                                mutex_lock(&io_mutex);
+                                toi_message(TOI_IO, TOI_VERBOSE, 0,
+                                                "Resaved %ld.", write_pfn);
+                                atomic_inc(&io_count);
+                                mutex_unlock(&io_mutex);
+                        }
+                }
+
+                if (!thread_num) {
+                        if (my_io_index + io_base > io_nextupdate)
+                                io_nextupdate = status_update(io_write,
+                                                my_io_index + io_base,
+                                                jiffies - start_time);
+
+                        if (my_io_index > io_pc) {
+                                printk(KERN_CONT "...%d%%", 20 * io_pc_step);
+                                io_pc_step++;
+                                io_pc = io_finish_at * io_pc_step / 5;
+                        }
+                }
+
+                toi_cond_pause(0, NULL);
+
+                /*
+                 * Subtle: If there's less I/O still to be done than threads
+                 * running, quit. This stops us doing I/O beyond the end of
+                 * the image when reading.
+                 *
+                 * Possible race condition. Two threads could do the test at
+                 * the same time; one should exit and one should continue.
+                 * Therefore we take the mutex before comparing and exiting.
+                 */
+
+                mutex_lock(&io_mutex);
+        }
+
+        last_worker = atomic_dec_and_test(&toi_io_workers);
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers));
+        mutex_unlock(&io_mutex);
+
+        if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) {
+                /* Were we the last thread and we're using a flusher thread? */
+                if (last_worker && using_flusher) {
+                        toiActiveAllocator->finish_all_io();
+                }
+                /* First, if we're doing I/O, wait for it to finish */
+                wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN);
+                /* Then wait to be told what to do next */
+                wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP);
+                if (toi_worker_command == TOI_IO_WORKER_RUN)
+                        goto top;
+        }
+
+        if (thread_num)
+                atomic_dec(&toi_num_other_threads);
+
+out:
+        toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num);
+        toi__free_page(28, buffer);
+        free_cpumask_var(orig_mask);
+
+        return result;
+}
+
+int toi_start_other_threads(void)
+{
+        int cpu;
+        struct task_struct *p;
+        int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1;
+        unsigned long num_started = 0;
+
+        if (test_action_state(TOI_NO_MULTITHREADED_IO))
+                return 0;
+
+        toi_worker_command = TOI_IO_WORKER_STOP;
+
+        for_each_online_cpu(cpu) {
+                if (num_started == to_start)
+                        break;
+
+                if (cpu == smp_processor_id())
+                        continue;
+
+                p = kthread_create_on_node(worker_rw_loop, (void *) (num_started + 1),
+                                cpu_to_node(cpu), "ktoi_io/%d", cpu);
+                if (IS_ERR(p)) {
+                        printk(KERN_ERR "ktoi_io for %i failed\n", cpu);
+                        continue;
+                }
+                kthread_bind(p, cpu);
+                p->flags |= PF_MEMALLOC;
+                wake_up_process(p);
+                num_started++;
+                atomic_inc(&toi_num_other_threads);
+        }
+
+        toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started);
+        return num_started;
+}
+
+void toi_stop_other_threads(void)
+{
+        toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads.");
+        toi_worker_command = TOI_IO_WORKER_EXIT;
+        wake_up(&toi_worker_wait_queue);
+}
+
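+/*
+ * Rough sequence for do_rw_loop() below: publish the io_* parameters, copy up
+ * to finish_at set bits from the supplied pageflags bitmap into io_map, wake
+ * any worker threads started earlier, run either the allocator's io_flusher
+ * or our own worker_rw_loop(), then wait for all workers to finish before
+ * checking the results.
+ */
+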
+/**
+ * do_rw_loop - main highlevel function for reading or writing pages
+ *
+ * Create the io_map bitmap and call worker_rw_loop to perform I/O operations.
+ **/
+static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags,
+                int base, int barmax, int pageset)
+{
+        int index = 0, cpu, result = 0, workers_started;
+        unsigned long pfn, next;
+
+        first_filter = toi_get_next_filter(NULL);
+
+        if (!finish_at)
+                return 0;
+
+        io_write = write;
+        io_finish_at = finish_at;
+        io_base = base;
+        io_barmax = barmax;
+        io_pageset = pageset;
+        io_index = 0;
+        io_pc = io_finish_at / 5;
+        io_pc_step = 1;
+        io_result = 0;
+        io_nextupdate = base + 1;
+        toi_bio_queue_flusher_should_finish = 0;
+
+        for_each_online_cpu(cpu) {
+                per_cpu(last_sought, cpu) = NULL;
+                per_cpu(last_low_page, cpu) = NULL;
+                per_cpu(last_high_page, cpu) = NULL;
+        }
+
+        /* Ensure all bits clear */
+        memory_bm_clear(io_map);
+
+        memory_bm_position_reset(io_map);
+        next = memory_bm_next_pfn(io_map, 0);
+
+        BUG_ON(next != BM_END_OF_MAP);
+
+        /* Set the bits for the pages to write */
+        memory_bm_position_reset(pageflags);
+
+        pfn = memory_bm_next_pfn(pageflags, 0);
+        toi_trace_index++;
+
+        while (pfn != BM_END_OF_MAP && index < finish_at) {
+                TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at);
+                memory_bm_set_bit(io_map, 0, pfn);
+                pfn = memory_bm_next_pfn(pageflags, 0);
+                index++;
+        }
+
+        BUG_ON(next != BM_END_OF_MAP || index < finish_at);
+
+        memory_bm_position_reset(io_map);
+        toi_trace_index++;
+
+        atomic_set(&io_count, finish_at);
+
+        memory_bm_position_reset(pageset1_map);
+
+        mutex_lock(&io_mutex);
+
+        clear_toi_state(TOI_IO_STOPPED);
+
+        using_flusher = (atomic_read(&toi_num_other_threads) &&
+                         toiActiveAllocator->io_flusher &&
+                         !test_action_state(TOI_NO_FLUSHER_THREAD));
+
+        workers_started = atomic_read(&toi_num_other_threads);
+
+        memory_bm_position_reset(io_map);
+        memory_bm_position_reset(pageset1_copy_map);
+
+        toi_worker_command = TOI_IO_WORKER_RUN;
+        wake_up(&toi_worker_wait_queue);
+
+        mutex_unlock(&io_mutex);
+
+        if (using_flusher)
+                result = toiActiveAllocator->io_flusher(write);
+        else
+                worker_rw_loop(NULL);
+
+        while (atomic_read(&toi_io_workers))
+                schedule();
+
+        printk(KERN_CONT "\n");
+
+        toi_worker_command = TOI_IO_WORKER_STOP;
+        wake_up(&toi_worker_wait_queue);
+
+        if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
+                if (!atomic_read(&toi_io_workers)) {
+                        rw_cleanup_modules(READ);
+                        set_toi_state(TOI_IO_STOPPED);
+                }
+                while (1)
+                        schedule();
+        }
+        set_toi_state(TOI_IO_STOPPED);
+
+        if (!io_result && !result && !test_result_state(TOI_ABORTED)) {
+                unsigned long next;
+
+                toi_update_status(io_base + io_finish_at, io_barmax,
+                                " %d/%d MB ",
+                                MB(io_base + io_finish_at), MB(io_barmax));
+
+                memory_bm_position_reset(io_map);
+                next = memory_bm_next_pfn(io_map, 0);
+                if (next != BM_END_OF_MAP) {
+                        printk(KERN_INFO "Finished I/O loop but still work to "
+                                        "do?\nFinish at = %d. io_count = %d.\n",
+                                        finish_at, atomic_read(&io_count));
+                        printk(KERN_INFO "I/O bitmap still records work to do."
+                                        "%ld.\n", next);
+                        BUG();
+                        do {
+                                cpu_relax();
+                        } while (0);
+                }
+        }
+
+        return io_result ? io_result : result;
+}
+
+/**
+ * write_pageset - write a pageset to disk.
+ * @pagedir:        Which pagedir to write.
+ *
+ * Returns:
+ *        Zero on success or -1 on failure.
+ **/
+int write_pageset(struct pagedir *pagedir)
+{
+        int finish_at, base = 0;
+        int barmax = pagedir1.size + pagedir2.size;
+        long error = 0;
+        struct memory_bitmap *pageflags;
+        unsigned long start_time, end_time;
+
+        /*
+         * Even if there is nothing to read or write, the allocator
+         * may need the init/cleanup for its housekeeping (e.g.
+         * Pageset1 may start where pageset2 ends when writing).
+         */
+        finish_at = pagedir->size;
+
+        if (pagedir->id == 1) {
+                toi_prepare_status(DONT_CLEAR_BAR,
+                                "Writing kernel & process data...");
+                base = pagedir2.size;
+                if (test_action_state(TOI_TEST_FILTER_SPEED) ||
+                    test_action_state(TOI_TEST_BIO))
+                        pageflags = pageset1_map;
+                else
+                        pageflags = pageset1_copy_map;
+        } else {
+                toi_prepare_status(DONT_CLEAR_BAR, "Writing caches...");
+                pageflags = pageset2_map;
+        }
+
+        start_time = jiffies;
+
+        if (rw_init_modules(WRITE, pagedir->id)) {
+                abort_hibernate(TOI_FAILED_MODULE_INIT,
+                                "Failed to initialise modules for writing.");
+                error = 1;
+        }
+
+        if (!error)
+                error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax,
+                                pagedir->id);
+
+        if (rw_cleanup_modules(WRITE) && !error) {
+                abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
+                                "Failed to cleanup after writing.");
+                error = 1;
+        }
+
+        end_time = jiffies;
+
+        if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
+                toi_bkd.toi_io_time[0][0] += finish_at;
+                toi_bkd.toi_io_time[0][1] += (end_time - start_time);
+        }
+
+        return error;
+}
+
+/**
+ * read_pageset - highlevel function to read a pageset from disk
+ * @pagedir:                        pageset to read
+ * @overwrittenpagesonly:        Whether to read the whole pageset or
+ *                                only part of it.
+ *
+ * Returns:
+ *        Zero on success or -1 on failure.
+ **/
+static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
+{
+        int result = 0, base = 0;
+        int finish_at = pagedir->size;
+        int barmax = pagedir1.size + pagedir2.size;
+        struct memory_bitmap *pageflags;
+        unsigned long start_time, end_time;
+
+        if (pagedir->id == 1) {
+                toi_prepare_status(DONT_CLEAR_BAR,
+                                "Reading kernel & process data...");
+                pageflags = pageset1_map;
+        } else {
+                toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
+                if (overwrittenpagesonly) {
+                        barmax = min(pagedir1.size, pagedir2.size);
+                        finish_at = min(pagedir1.size, pagedir2.size);
+                } else
+                        base = pagedir1.size;
+                pageflags = pageset2_map;
+        }
+
+        start_time = jiffies;
+
+        if (rw_init_modules(READ, pagedir->id)) {
+                toiActiveAllocator->remove_image();
+                result = 1;
+        } else
+                result = do_rw_loop(READ, finish_at, pageflags, base, barmax,
+                                pagedir->id);
+
+        if (rw_cleanup_modules(READ) && !result) {
+                abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
+                                "Failed to cleanup after reading.");
+                result = 1;
+        }
+
+        /* Statistics */
+        end_time = jiffies;
+
+        if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
+                toi_bkd.toi_io_time[1][0] += finish_at;
+                toi_bkd.toi_io_time[1][1] += (end_time - start_time);
+        }
+
+        return result;
+}
+
+/**
+ * write_module_configs - store the modules configuration
+ *
+ * The configuration for each module is stored in the image header.
+ * Returns: Int
+ *        Zero on success, Error value otherwise.
+ **/
+static int write_module_configs(void)
+{
+        struct toi_module_ops *this_module;
+        char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
+        int len, index = 1;
+        struct toi_module_header toi_module_header;
+
+        if (!buffer) {
+                printk(KERN_INFO "Failed to allocate a buffer for saving "
+                                "module configuration info.\n");
+                return -ENOMEM;
+        }
+
+        /*
+         * We have to know which data goes with which module, so we at
+         * least write a length of zero for a module. Note that we are
+         * also assuming every module's config data takes <= PAGE_SIZE.
+         */
+
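+        /*
+         * The resulting header layout, as written by the loop below, is:
+         *
+         *   [struct toi_module_header][int len][len bytes of config data]
+         *   ...repeated for each enabled module...
+         *   [struct toi_module_header with name[0] == '\0']  <- terminator
+         */
+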
+        /* For each module (in registration order) */
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled || !this_module->storage_needed ||
+                    (this_module->type == WRITER_MODULE &&
+                     toiActiveAllocator != this_module))
+                        continue;
+
+                /* Get the data from the module */
+                len = 0;
+                if (this_module->save_config_info)
+                        len = this_module->save_config_info(buffer);
+
+                /* Save the details of the module */
+                toi_module_header.enabled = this_module->enabled;
+                toi_module_header.type = this_module->type;
+                toi_module_header.index = index++;
+                strncpy(toi_module_header.name, this_module->name,
+                                        sizeof(toi_module_header.name));
+                toiActiveAllocator->rw_header_chunk(WRITE,
+                                this_module,
+                                (char *) &toi_module_header,
+                                sizeof(toi_module_header));
+
+                /* Save the size of the data and any data returned */
+                toiActiveAllocator->rw_header_chunk(WRITE,
+                                this_module,
+                                (char *) &len, sizeof(int));
+                if (len)
+                        toiActiveAllocator->rw_header_chunk(
+                                WRITE, this_module, buffer, len);
+        }
+
+        /* Write a blank header to terminate the list */
+        toi_module_header.name[0] = '\0';
+        toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                        (char *) &toi_module_header, sizeof(toi_module_header));
+
+        toi_free_page(22, (unsigned long) buffer);
+        return 0;
+}
+
+/**
+ * read_one_module_config - read and configure one module
+ *
+ * Read the configuration for one module, and configure the module
+ * to match if it is loaded.
+ *
+ * Returns: Int
+ *        Zero on success, Error value otherwise.
+ **/
+static int read_one_module_config(struct toi_module_header *header)
+{
+        struct toi_module_ops *this_module;
+        int result, len;
+        char *buffer;
+
+        /* Find the module */
+        this_module = toi_find_module_given_name(header->name);
+
+        if (!this_module) {
+                if (header->enabled) {
+                        toi_early_boot_message(1, TOI_CONTINUE_REQ,
+                                "It looks like we need module %s for reading "
+                                "the image but it hasn't been registered.\n",
+                                header->name);
+                        if (!(test_toi_state(TOI_CONTINUE_REQ)))
+                                return -EINVAL;
+                } else
+                        printk(KERN_INFO "Module %s configuration data found, "
+                                "but the module hasn't registered. Looks like "
+                                "it was disabled, so we're ignoring its data.\n",
+                                header->name);
+        }
+
+        /* Get the length of the data (if any) */
+        result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len,
+                        sizeof(int));
+        if (result) {
+                printk(KERN_ERR "Failed to read the length of the module %s's"
+                                " configuration data.\n",
+                                header->name);
+                return -EINVAL;
+        }
+
+        /* Read any data and pass to the module (if we found one) */
+        if (!len)
+                return 0;
+
+        buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
+
+        if (!buffer) {
+                printk(KERN_ERR "Failed to allocate a buffer for reloading "
+                                "module configuration info.\n");
+                return -ENOMEM;
+        }
+
+        toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len);
+
+        if (!this_module)
+                goto out;
+
+        if (!this_module->load_config_info)
+                printk(KERN_ERR "Huh? Module %s appears to have a "
+                                "save_config_info, but not a load_config_info "
+                                "function!\n", this_module->name);
+        else
+                this_module->load_config_info(buffer, len);
+
+        /*
+         * Now move this module to the tail of its lists. This will put it in
+         * order. Any new modules will end up at the top of the lists. They
+         * should have been set to disabled when loaded (people will
+         * normally not edit an initrd to load a new module and then hibernate
+         * without using it!).
+         */
+
+        toi_move_module_tail(this_module);
+
+        this_module->enabled = header->enabled;
+
+out:
+        toi_free_page(23, (unsigned long) buffer);
+        return 0;
+}
+
+/**
+ * read_module_configs - reload module configurations from the image header.
+ *
+ * Returns: Int
+ *        Zero on success or an error code.
+ **/
+static int read_module_configs(void)
+{
+        int result = 0;
+        struct toi_module_header toi_module_header;
+        struct toi_module_ops *this_module;
+
+        /* All modules are initially disabled. That way, if we have a module
+         * loaded now that wasn't loaded when we hibernated, it won't be used
+         * in trying to read the data.
+         */
+        list_for_each_entry(this_module, &toi_modules, module_list)
+                this_module->enabled = 0;
+
+        /* Get the first module header */
+        result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+                        (char *) &toi_module_header,
+                        sizeof(toi_module_header));
+        if (result) {
+                printk(KERN_ERR "Failed to read the next module header.\n");
+                return -EINVAL;
+        }
+
+        /* For each module (in registration order) */
+        while (toi_module_header.name[0]) {
+                result = read_one_module_config(&toi_module_header);
+
+                if (result)
+                        return -EINVAL;
+
+                /* Get the next module header */
+                result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+                                (char *) &toi_module_header,
+                                sizeof(toi_module_header));
+
+                if (result) {
+                        printk(KERN_ERR "Failed to read the next module "
+                                        "header.\n");
+                        return -EINVAL;
+                }
+        }
+
+        return 0;
+}
+
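+/*
+ * Only device-backed filesystems for which fs_info (including last-mount
+ * data) could be obtained are recorded in the image header; everything
+ * else is skipped both when sizing and when saving.
+ */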
+static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev)
+{
+        return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1;
+}
+
+int fs_info_space_needed(int reset)
+{
+        static int last_result;
+        const struct super_block *sb;
+        int result = sizeof(int);
+
+        if (!last_result || reset) {
+                list_for_each_entry(sb, &super_blocks, s_list) {
+                        struct fs_info *fs;
+
+                        if (!sb->s_bdev)
+                                continue;
+
+                        fs = fs_info_from_block_dev(sb->s_bdev);
+                        if (save_fs_info(fs, sb->s_bdev))
+                                result += 16 + sizeof(dev_t) + sizeof(int) +
+                                        fs->last_mount_size;
+                        free_fs_info(fs);
+                }
+                last_result = result;
+        }
+        return last_result;
+}
+
+static int fs_info_num_to_save(void)
+{
+        const struct super_block *sb;
+        int to_save = 0;
+
+        list_for_each_entry(sb, &super_blocks, s_list) {
+                struct fs_info *fs;
+
+                if (!sb->s_bdev)
+                        continue;
+
+                fs = fs_info_from_block_dev(sb->s_bdev);
+                if (save_fs_info(fs, sb->s_bdev))
+                        to_save++;
+                free_fs_info(fs);
+        }
+
+        return to_save;
+}
+
+static int fs_info_save(void)
+{
+        const struct super_block *sb;
+        int to_save = fs_info_num_to_save();
+
+        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save,
+                                sizeof(int))) {
+                abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info"
+                                " to save.");
+                return -EIO;
+        }
+
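+        /*
+         * For each filesystem saved: 16 bytes of uuid, the dev_t, the length
+         * of the last-mount data and then the data itself. This layout must
+         * match the space calculated in fs_info_space_needed().
+         */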
+        list_for_each_entry(sb, &super_blocks, s_list) {
+                struct fs_info *fs;
+
+                if (!sb->s_bdev)
+                        continue;
+
+                fs = fs_info_from_block_dev(sb->s_bdev);
+                if (save_fs_info(fs, sb->s_bdev)) {
+                        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                                        &fs->uuid[0], 16)) {
+                                abort_hibernate(TOI_FAILED_IO, "Failed to "
+                                                "write uuid.");
+                                return -EIO;
+                        }
+                        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                                        (char *) &fs->dev_t, sizeof(dev_t))) {
+                                abort_hibernate(TOI_FAILED_IO, "Failed to "
+                                                "write dev_t.");
+                                return -EIO;
+                        }
+                        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                                        (char *) &fs->last_mount_size, sizeof(int))) {
+                                abort_hibernate(TOI_FAILED_IO, "Failed to "
+                                                "write last mount length.");
+                                return -EIO;
+                        }
+                        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                                        fs->last_mount, fs->last_mount_size)) {
+                                abort_hibernate(TOI_FAILED_IO, "Failed to "
+                                                "write last mount timestamp.");
+                                return -EIO;
+                        }
+                }
+                free_fs_info(fs);
+        }
+        return 0;
+}
+
+static int fs_info_load_and_check_one(void)
+{
+        char uuid[16], *last_mount;
+        int result = 0, ln;
+        dev_t dev_t;
+        struct block_device *dev;
+        struct fs_info *fs_info, seek;
+
+        if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) {
+                abort_hibernate(TOI_FAILED_IO, "Failed to read uuid.");
+                return -EIO;
+        }
+
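+        /*
+         * read_if_version() is assumed to read this field only from image
+         * headers new enough (here version 3+) to contain it; older images
+         * simply skip it.
+         */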
+        read_if_version(3, dev_t, "uuid dev_t field", return -EIO);
+
+        if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln,
+                                sizeof(int))) {
+                abort_hibernate(TOI_FAILED_IO,
+                                "Failed to read last mount size.");
+                return -EIO;
+        }
+
+        last_mount = kzalloc(ln, GFP_KERNEL);
+
+        if (!last_mount)
+                return -ENOMEM;
+
+        if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) {
+                abort_hibernate(TOI_FAILED_IO,
+                                "Failed to read last mount timestamp.");
+                result = -EIO;
+                goto out_lmt;
+        }
+
+        strncpy((char *) &seek.uuid, uuid, 16);
+        seek.dev_t = dev_t;
+        seek.last_mount_size = ln;
+        seek.last_mount = last_mount;
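+        /*
+         * Ask the block layer for a device whose fs_info matches the saved
+         * record; zero means nothing currently present matches, so there is
+         * nothing to check.
+         */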
+        dev_t = blk_lookup_fs_info(&seek);
+        if (!dev_t)
+                goto out_lmt;
+
+        dev = toi_open_by_devnum(dev_t);
+
+        fs_info = fs_info_from_block_dev(dev);
+        if (fs_info && !IS_ERR(fs_info)) {
+                if (ln != fs_info->last_mount_size) {
+                        printk(KERN_EMERG "Found matching uuid but last mount "
+                                        "time lengths differ?! "
+                                        "(%d vs %d).\n", ln,
+                                        fs_info->last_mount_size);
+                        result = -EINVAL;
+                } else {
+                        char buf[BDEVNAME_SIZE];
+                        result = !!memcmp(fs_info->last_mount, last_mount, ln);
+                        if (result)
+                                printk(KERN_EMERG "Last mount time for %s has "
+                                        "changed!\n", bdevname(dev, buf));
+                }
+        }
+        toi_close_bdev(dev);
+        free_fs_info(fs_info);
+out_lmt:
+        kfree(last_mount);
+        return result;
+}
+
+static int fs_info_load_and_check(void)
+{
+        int to_do, result = 0;
+
+        if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do,
+                                sizeof(int))) {
+                abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info "
+                                "to load.");
+                return -EIO;
+        }
+
+        while(to_do--)
+                result |= fs_info_load_and_check_one();
+
+        return result;
+}
+
+/**
+ * write_image_header - write the image header after writing the image proper
+ *
+ * Returns: Int
+ *        Zero on success, error value otherwise.
+ **/
+int write_image_header(void)
+{
+        int ret;
+        int total = pagedir1.size + pagedir2.size + 2;
+        char *header_buffer = NULL;
+
+        /* Now prepare to write the header */
+        ret = toiActiveAllocator->write_header_init();
+        if (ret) {
+                abort_hibernate(TOI_FAILED_MODULE_INIT,
+                                "Active allocator's write_header_init"
+                                " function failed.");
+                goto write_image_header_abort;
+        }
+
+        /* Get a buffer */
+        header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
+        if (!header_buffer) {
+                abort_hibernate(TOI_OUT_OF_MEMORY,
+                        "Out of memory when trying to get page for header!");
+                goto write_image_header_abort;
+        }
+
+        /* Write hibernate header */
+        if (fill_toi_header((struct toi_header *) header_buffer)) {
+                abort_hibernate(TOI_OUT_OF_MEMORY,
+                        "Failure to fill header information!");
+                goto write_image_header_abort;
+        }
+
+        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                        header_buffer, sizeof(struct toi_header))) {
+                abort_hibernate(TOI_OUT_OF_MEMORY,
+                        "Failure to write header info.");
+                goto write_image_header_abort;
+        }
+
+        if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+                        (char *) &toi_max_workers, sizeof(toi_max_workers))) {
+                abort_hibernate(TOI_OUT_OF_MEMORY,
+                        "Failure to write the number of workers to use.");
+                goto write_image_header_abort;
+        }
+
+        /* Write filesystem info */
+        if (fs_info_save())
+                goto write_image_header_abort;
+
+        /* Write module configurations */
+        ret = write_module_configs();
+        if (ret) {
+                abort_hibernate(TOI_FAILED_IO,
+                                "Failed to write module configs.");
+                goto write_image_header_abort;
+        }
+
+        if (memory_bm_write(pageset1_map,
+                                toiActiveAllocator->rw_header_chunk)) {
+                abort_hibernate(TOI_FAILED_IO,
+                                "Failed to write bitmaps.");
+                goto write_image_header_abort;
+        }
+
+        /* Flush data and let allocator cleanup */
+        if (toiActiveAllocator->write_header_cleanup()) {
+                abort_hibernate(TOI_FAILED_IO,
+                                "Failed to cleanup writing header.");
+                goto write_image_header_abort_no_cleanup;
+        }
+
+        if (test_result_state(TOI_ABORTED))
+                goto write_image_header_abort_no_cleanup;
+
+        toi_update_status(total, total, NULL);
+
+out:
+        if (header_buffer)
+                toi_free_page(24, (unsigned long) header_buffer);
+        return ret;
+
+write_image_header_abort:
+        toiActiveAllocator->write_header_cleanup();
+write_image_header_abort_no_cleanup:
+        ret = -1;
+        goto out;
+}
+
+/**
+ * sanity_check - check the header
+ * @sh:        the header which was saved at hibernate time.
+ *
+ * Perform a few checks, seeking to ensure that the kernel being
+ * booted matches the one hibernated. They need to match so we can
+ * be _sure_ things will work. It is not absolutely impossible for
+ * resuming from a different kernel to work, just not assured.
+ **/
+static char *sanity_check(struct toi_header *sh)
+{
+        char *reason = check_image_kernel((struct swsusp_info *) sh);
+
+        if (reason)
+                return reason;
+
+        if (!test_action_state(TOI_IGNORE_ROOTFS)) {
+                const struct super_block *sb;
+                list_for_each_entry(sb, &super_blocks, s_list) {
+                        if ((!(sb->s_flags & MS_RDONLY)) &&
+                            (sb->s_type->fs_flags & FS_REQUIRES_DEV))
+                                return "Device backed fs has been mounted "
+                                        "rw prior to resume or initrd/ramfs "
+                                        "is mounted rw.";
+                }
+        }
+
+        return NULL;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(freeze_wait);
+
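+/*
+ * Sentinel held in freeze_result while the freezer work item is still
+ * running; do_freeze() replaces it with the real freeze_processes() result
+ * and wakes anyone sleeping on freeze_wait.
+ */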
+#define FREEZE_IN_PROGRESS (~0)
+
+static int freeze_result;
+
+static void do_freeze(struct work_struct *dummy)
+{
+        freeze_result = freeze_processes();
+        wake_up(&freeze_wait);
+        trap_non_toi_io = 1;
+}
+
+static DECLARE_WORK(freeze_work, do_freeze);
+
+/**
+ * __read_pageset1 - test for the existence of an image and attempt to load it
+ *
+ * Returns:        Int
+ *        Zero if image found and pageset1 successfully loaded.
+ *        Error if no image found or loaded.
+ **/
+static int __read_pageset1(void)
+{
+        int i, result = 0;
+        char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
+             *sanity_error = NULL;
+        struct toi_header *toi_header;
+
+        if (!header_buffer) {
+                printk(KERN_INFO "Unable to allocate a page for reading the "
+                                "signature.\n");
+                return -ENOMEM;
+        }
+
+        /* Check for an image */
+        result = toiActiveAllocator->image_exists(1);
+        if (result == 3) {
+                result = -ENODATA;
+                toi_early_boot_message(1, 0, "The signature from an older "
+                                "version of TuxOnIce has been detected.");
+                goto out_remove_image;
+        }
+
+        if (result != 1) {
+                result = -ENODATA;
+                noresume_reset_modules();
+                printk(KERN_INFO "TuxOnIce: No image found.\n");
+                goto out;
+        }
+
+        /*
+         * Prepare the active allocator for reading the image header. The
+         * active allocator might read its own configuration.
+         *
+         * NB: This call may never return: if the signature looks like it
+         * belongs to a different image - for example the device ids look
+         * erroneous (2.4 vs 2.6) or the image was stored on a network
+         * connection that is now unavailable - we warn the user and they
+         * may choose to reboot rather than continue.
+         */
+
+        result = toiActiveAllocator->read_header_init();
+        if (result) {
+                printk(KERN_INFO "TuxOnIce: Failed to initialise reading the "
+                                "image header.\n");
+                goto out_remove_image;
+        }
+
+        /* Check for noresume command line option */
+        if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
+                printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
+                                "image.\n");
+                goto out_remove_image;
+        }
+
+        /* Check whether we've resumed before */
+        if (test_toi_state(TOI_RESUMED_BEFORE)) {
+                toi_early_boot_message(1, 0, NULL);
+                if (!(test_toi_state(TOI_CONTINUE_REQ))) {
+                        printk(KERN_INFO "TuxOnIce: Tried to resume before: "
+                                        "Invalidated image.\n");
+                        goto out_remove_image;
+                }
+        }
+
+        clear_toi_state(TOI_CONTINUE_REQ);
+
+        toi_image_header_version = toiActiveAllocator->get_header_version();
+
+        if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) {
+                toi_early_boot_message(1, 0, image_version_error);
+                if (!(test_toi_state(TOI_CONTINUE_REQ))) {
+                        printk(KERN_INFO "TuxOnIce: Header version too new: "
+                                        "Invalidated image.\n");
+                        goto out_remove_image;
+                }
+        }
+
+        /* Read hibernate header */
+        result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+                        header_buffer, sizeof(struct toi_header));
+        if (result < 0) {
+                printk(KERN_ERR "TuxOnIce: Failed to read the image "
+                                "signature.\n");
+                goto out_remove_image;
+        }
+
+        toi_header = (struct toi_header *) header_buffer;
+
+        /*
+         * NB: This call may also result in a reboot rather than returning.
+         */
+
+        sanity_error = sanity_check(toi_header);
+        if (sanity_error) {
+                toi_early_boot_message(1, TOI_CONTINUE_REQ,
+                                sanity_error);
+                printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
+                goto out_remove_image;
+        }
+
+        /*
+         * We have an image and it looks like it will load okay.
+         *
+         * Get metadata from header. Don't override commandline parameters.
+         *
+         * We don't need to save the image size limit because it's not used
+         * during resume and will be restored with the image anyway.
+         */
+
+        memcpy((char *) &pagedir1,
+                (char *) &toi_header->pagedir, sizeof(pagedir1));
+        toi_result = toi_header->param0;
+        if (!toi_bkd.toi_debug_state) {
+                toi_bkd.toi_action =
+                        (toi_header->param1 & ~toi_bootflags_mask) |
+                        (toi_bkd.toi_action & toi_bootflags_mask);
+                toi_bkd.toi_debug_state = toi_header->param2;
+                toi_bkd.toi_default_console_level = toi_header->param3;
+        }
+        clear_toi_state(TOI_IGNORE_LOGLEVEL);
+        pagedir2.size = toi_header->pageset_2_size;
+        for (i = 0; i < 4; i++)
+                toi_bkd.toi_io_time[i/2][i%2] =
+                        toi_header->io_time[i/2][i%2];
+
+        set_toi_state(TOI_BOOT_KERNEL);
+        boot_kernel_data_buffer = toi_header->bkd;
+
+        read_if_version(1, toi_max_workers, "TuxOnIce max workers",
+                        goto out_remove_image);
+
+        /* Read filesystem info */
+        if (fs_info_load_and_check()) {
+                printk(KERN_EMERG "TuxOnIce: File system mount time checks "
+                        "failed. Refusing to corrupt your filesystems!\n");
+                goto out_remove_image;
+        }
+
+        /* Read module configurations */
+        result = read_module_configs();
+        if (result) {
+                pagedir1.size = 0;
+                pagedir2.size = 0;
+                printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
+                                "configurations.\n");
+                clear_action_state(TOI_KEEP_IMAGE);
+                goto out_remove_image;
+        }
+
+        toi_prepare_console();
+
+        set_toi_state(TOI_NOW_RESUMING);
+
+        result = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+        if (result)
+                goto out_notifier_call_chain;
+
+        if (usermodehelper_disable())
+                goto out_enable_usermodehelper;
+
+        current->flags |= PF_NOFREEZE;
+        freeze_result = FREEZE_IN_PROGRESS;
+
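+        /*
+         * Freeze userspace in the background (on one CPU) while we carry on
+         * reading the image; we wait for freeze_result before returning.
+         */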
+        schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
+
+        toi_cond_pause(1, "About to read original pageset1 locations.");
+
+        /*
+         * See _toi_rw_header_chunk in tuxonice_bio.c:
+         * Initialize pageset1_map by reading the map from the image.
+         */
+        if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
+                goto out_thaw;
+
+        /*
+         * See toi_rw_cleanup in tuxonice_bio.c:
+         * Clean up after reading the header.
+         */
+        result = toiActiveAllocator->read_header_cleanup();
+        if (result) {
+                printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
+                                "image header.\n");
+                goto out_thaw;
+        }
+
+        toi_cond_pause(1, "About to read pagedir.");
+
+        /*
+         * Get the addresses of pages into which we will load the kernel to
+         * be copied back and check if they conflict with the ones we are using.
+         */
+        if (toi_get_pageset1_load_addresses()) {
+                printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
+                                "pageset1.\n");
+                goto out_thaw;
+        }
+
+        /* Read the original kernel back */
+        toi_cond_pause(1, "About to read pageset 1.");
+
+        /* Given the pagemap, read back the data from disk */
+        if (read_pageset(&pagedir1, 0)) {
+                toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
+                result = -EIO;
+                goto out_thaw;
+        }
+
+        toi_cond_pause(1, "About to restore original kernel.");
+        result = 0;
+
+        if (!toi_keeping_image &&
+            toiActiveAllocator->mark_resume_attempted)
+                toiActiveAllocator->mark_resume_attempted(1);
+
+        wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
+out:
+        current->flags &= ~PF_NOFREEZE;
+        toi_free_page(25, (unsigned long) header_buffer);
+        return result;
+
+out_thaw:
+        wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
+        trap_non_toi_io = 0;
+        thaw_processes();
+out_enable_usermodehelper:
+        usermodehelper_enable();
+out_notifier_call_chain:
+        pm_notifier_call_chain(PM_POST_RESTORE);
+        toi_cleanup_console();
+out_remove_image:
+        result = -EINVAL;
+        if (!toi_keeping_image)
+                toiActiveAllocator->remove_image();
+        toiActiveAllocator->read_header_cleanup();
+        noresume_reset_modules();
+        goto out;
+}
+
+/**
+ * read_pageset1 - highlevel function to read the saved pages
+ *
+ * Attempt to read the header and pageset1 of a hibernate image.
+ * Handle the outcome, complaining where appropriate.
+ **/
+int read_pageset1(void)
+{
+        int error;
+
+        error = __read_pageset1();
+
+        if (error && error != -ENODATA && error != -EINVAL &&
+                                        !test_result_state(TOI_ABORTED))
+                abort_hibernate(TOI_IMAGE_ERROR,
+                        "TuxOnIce: Error %d resuming\n", error);
+
+        return error;
+}
+
+/**
+ * get_have_image_data - check the image header
+ **/
+static char *get_have_image_data(void)
+{
+        char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
+        struct toi_header *toi_header;
+
+        if (!output_buffer) {
+                printk(KERN_INFO "Output buffer null.\n");
+                return NULL;
+        }
+
+        /* Check for an image */
+        if (!toiActiveAllocator->image_exists(1) ||
+            toiActiveAllocator->read_header_init() ||
+            toiActiveAllocator->rw_header_chunk(READ, NULL,
+                        output_buffer, sizeof(struct toi_header))) {
+                sprintf(output_buffer, "0\n");
+                /*
+                 * From an initrd/ramfs, catting have_image and
+                 * getting a result of 0 is sufficient.
+                 */
+                clear_toi_state(TOI_BOOT_TIME);
+                goto out;
+        }
+
+        toi_header = (struct toi_header *) output_buffer;
+
+        sprintf(output_buffer, "1\n%s\n%s\n",
+                        toi_header->uts.machine,
+                        toi_header->uts.version);
+
+        /* Check whether we've resumed before */
+        if (test_toi_state(TOI_RESUMED_BEFORE))
+                strcat(output_buffer, "Resumed before.\n");
+
+out:
+        noresume_reset_modules();
+        return output_buffer;
+}
+
+/**
+ * read_pageset2 - read second part of the image
+ * @overwrittenpagesonly:        Read only pages which would have been
+ *                                overwritten by pageset1?
+ *
+ * Read in part or all of pageset2 of an image, depending upon
+ * whether we are hibernating and have only overwritten a portion
+ * with pageset1 pages, or are resuming and need to read them
+ * all.
+ *
+ * Returns: Int
+ *        Zero if no error, otherwise the error value.
+ **/
+int read_pageset2(int overwrittenpagesonly)
+{
+        int result = 0;
+
+        if (!pagedir2.size)
+                return 0;
+
+        result = read_pageset(&pagedir2, overwrittenpagesonly);
+
+        toi_cond_pause(1, "Pagedir 2 read.");
+
+        return result;
+}
+
+/**
+ * image_exists_read - has an image been found?
+ * @page:        Output buffer
+ *
+ * Store 0 or 1 in page, depending on whether an image is found.
+ * Incoming buffer is PAGE_SIZE and result is guaranteed
+ * to be far less than that, so we don't worry about
+ * overflow.
+ **/
+int image_exists_read(const char *page, int count)
+{
+        int len = 0;
+        char *result;
+
+        if (toi_activate_storage(0))
+                return count;
+
+        if (!test_toi_state(TOI_RESUME_DEVICE_OK))
+                toi_attempt_to_parse_resume_device(0);
+
+        if (!toiActiveAllocator) {
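+                /* No usable allocator: report -1 rather than 0 or 1. */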
+                len = sprintf((char *) page, "-1\n");
+        } else {
+                result = get_have_image_data();
+                if (result) {
+                        len = sprintf((char *) page, "%s",  result);
+                        toi_free_page(26, (unsigned long) result);
+                }
+        }
+
+        toi_deactivate_storage(0);
+
+        return len;
+}
+
+/**
+ * image_exists_write - invalidate an image if one exists
+ **/
+int image_exists_write(const char *buffer, int count)
+{
+        if (toi_activate_storage(0))
+                return count;
+
+        if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
+                toiActiveAllocator->remove_image();
+
+        toi_deactivate_storage(0);
+
+        clear_result_state(TOI_KEPT_IMAGE);
+
+        return count;
+}
diff --git b/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
new file mode 100644
index 0000000..7d4d83f
--- /dev/null
+++ b/kernel/power/tuxonice_io.h
@@ -0,0 +1,72 @@
+/*
+ * kernel/power/tuxonice_io.h
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for hibernating.
+ *
+ */
+
+#include <linux/utsname.h>
+#include "tuxonice_pagedir.h"
+
+/* Non-module data saved in our image header */
+struct toi_header {
+        /*
+         * Mirror struct swsusp_info, but without
+         * the page aligned attribute
+         */
+        struct new_utsname uts;
+        u32 version_code;
+        unsigned long num_physpages;
+        int cpus;
+        unsigned long image_pages;
+        unsigned long pages;
+        unsigned long size;
+
+        /* Our own data */
+        unsigned long orig_mem_free;
+        int page_size;
+        int pageset_2_size;
+        int param0;
+        int param1;
+        int param2;
+        int param3;
+        int progress0;
+        int progress1;
+        int progress2;
+        int progress3;
+        int io_time[2][2];
+        struct pagedir pagedir;
+        dev_t root_fs;
+        unsigned long bkd; /* Boot kernel data locn */
+};
+
+extern int write_pageset(struct pagedir *pagedir);
+extern int write_image_header(void);
+extern int read_pageset1(void);
+extern int read_pageset2(int overwrittenpagesonly);
+
+extern int toi_attempt_to_parse_resume_device(int quiet);
+extern void attempt_to_parse_resume_device2(void);
+extern void attempt_to_parse_alt_resume_param(void);
+int image_exists_read(const char *page, int count);
+int image_exists_write(const char *buffer, int count);
+extern void save_restore_alt_param(int replace, int quiet);
+extern atomic_t toi_io_workers;
+
+/* Args to save_restore_alt_param */
+#define RESTORE 0
+#define SAVE 1
+
+#define NOQUIET 0
+#define QUIET 1
+
+extern wait_queue_head_t toi_io_queue_flusher;
+extern int toi_bio_queue_flusher_should_finish;
+
+int fs_info_space_needed(int reset);
+
+extern int toi_max_workers;
diff --git b/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
new file mode 100644
index 0000000..a203c8f
--- /dev/null
+++ b/kernel/power/tuxonice_modules.c
@@ -0,0 +1,520 @@
+/*
+ * kernel/power/tuxonice_modules.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_ui.h"
+
+LIST_HEAD(toi_filters);
+LIST_HEAD(toiAllocators);
+
+LIST_HEAD(toi_modules);
+
+struct toi_module_ops *toiActiveAllocator;
+
+static int toi_num_filters;
+int toiNumAllocators, toi_num_modules;
+
+/*
+ * toi_header_storage_for_modules
+ *
+ * Returns the amount of space needed to store configuration
+ * data needed by the modules prior to copying back the original
+ * kernel. We can exclude data for pageset2 because it will be
+ * available anyway once the kernel is copied back.
+ */
+long toi_header_storage_for_modules(void)
+{
+        struct toi_module_ops *this_module;
+        int bytes = 0;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    (this_module->type == WRITER_MODULE &&
+                     toiActiveAllocator != this_module))
+                        continue;
+                if (this_module->storage_needed) {
+                        int this = this_module->storage_needed() +
+                                sizeof(struct toi_module_header) +
+                                sizeof(int);
+                        this_module->header_requested = this;
+                        bytes += this;
+                }
+        }
+
+        /* One more for the empty terminator */
+        return bytes + sizeof(struct toi_module_header);
+}
+
+void print_toi_header_storage_for_modules(void)
+{
+        struct toi_module_ops *this_module;
+        int bytes = 0;
+
+        printk(KERN_DEBUG "Header storage:\n");
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled ||
+                    (this_module->type == WRITER_MODULE &&
+                     toiActiveAllocator != this_module))
+                        continue;
+                if (this_module->storage_needed) {
+                        int this = this_module->storage_needed() +
+                                sizeof(struct toi_module_header) +
+                                sizeof(int);
+                        this_module->header_requested = this;
+                        bytes += this;
+                        printk(KERN_DEBUG "+ %16s : %-4d/%d.\n",
+                                        this_module->name,
+                                        this_module->header_used, this);
+                }
+        }
+
+        printk(KERN_DEBUG "+ empty terminator : %zu.\n",
+                        sizeof(struct toi_module_header));
+        printk(KERN_DEBUG "                     ====\n");
+        printk(KERN_DEBUG "                     %zu\n",
+                        bytes + sizeof(struct toi_module_header));
+}
+
+/*
+ * toi_memory_for_modules
+ *
+ * Returns the amount of memory requested by modules for
+ * doing their work during the cycle.
+ */
+
+long toi_memory_for_modules(int print_parts)
+{
+        long bytes = 0, result;
+        struct toi_module_ops *this_module;
+
+        if (print_parts)
+                printk(KERN_INFO "Memory for modules:\n===================\n");
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                int this;
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->memory_needed) {
+                        this = this_module->memory_needed();
+                        if (print_parts)
+                                printk(KERN_INFO "%10d bytes (%5ld pages) for "
+                                                "module '%s'.\n", this,
+                                                DIV_ROUND_UP(this, PAGE_SIZE),
+                                                this_module->name);
+                        bytes += this;
+                }
+        }
+
+        result = DIV_ROUND_UP(bytes, PAGE_SIZE);
+        if (print_parts)
+                printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
+
+        return result;
+}
+
+/*
+ * toi_expected_compression_ratio
+ *
+ * Returns the compression ratio expected when saving the image.
+ */
+
+int toi_expected_compression_ratio(void)
+{
+        int ratio = 100;
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->expected_compression)
+                        ratio = ratio * this_module->expected_compression()
+                                / 100;
+        }
+
+        return ratio;
+}
+
+/* toi_find_module_given_dir
+ * Functionality :        Return a module (if found), given a pointer
+ *                         to its directory name
+ */
+
+static struct toi_module_ops *toi_find_module_given_dir(char *name)
+{
+        struct toi_module_ops *this_module, *found_module = NULL;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!strcmp(name, this_module->directory)) {
+                        found_module = this_module;
+                        break;
+                }
+        }
+
+        return found_module;
+}
+
+/* toi_find_module_given_name
+ * Functionality :        Return a module (if found), given a pointer
+ *                         to its name
+ */
+
+struct toi_module_ops *toi_find_module_given_name(char *name)
+{
+        struct toi_module_ops *this_module, *found_module = NULL;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!strcmp(name, this_module->name)) {
+                        found_module = this_module;
+                        break;
+                }
+        }
+
+        return found_module;
+}
+
+/*
+ * toi_print_module_debug_info
+ * Functionality   : Get debugging info from modules into a buffer.
+ */
+int toi_print_module_debug_info(char *buffer, int buffer_size)
+{
+        struct toi_module_ops *this_module;
+        int len = 0;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->print_debug_info) {
+                        int result;
+                        result = this_module->print_debug_info(buffer + len,
+                                        buffer_size - len);
+                        len += result;
+                }
+        }
+
+        /* Ensure null terminated */
+        buffer[buffer_size - 1] = '\0';
+
+        return len;
+}
+
+/*
+ * toi_register_module
+ *
+ * Register a module.
+ */
+int toi_register_module(struct toi_module_ops *module)
+{
+        int i;
+        struct kobject *kobj;
+
+        if (!hibernation_available())
+                return -ENODEV;
+
+        module->enabled = 1;
+
+        if (toi_find_module_given_name(module->name)) {
+                printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
+                                " which is already registered.\n",
+                                module->name);
+                return -EBUSY;
+        }
+
+        switch (module->type) {
+        case FILTER_MODULE:
+                list_add_tail(&module->type_list, &toi_filters);
+                toi_num_filters++;
+                break;
+        case WRITER_MODULE:
+                list_add_tail(&module->type_list, &toiAllocators);
+                toiNumAllocators++;
+                break;
+        case MISC_MODULE:
+        case MISC_HIDDEN_MODULE:
+        case BIO_ALLOCATOR_MODULE:
+                break;
+        default:
+                printk(KERN_ERR "Hmmm. Module '%s' has an invalid type."
+                        " It has been ignored.\n", module->name);
+                return -EINVAL;
+        }
+        list_add_tail(&module->module_list, &toi_modules);
+        toi_num_modules++;
+
+        if ((!module->directory && !module->shared_directory) ||
+                        !module->sysfs_data || !module->num_sysfs_entries)
+                return 0;
+
+        /*
+         * Modules may share a directory, but those with shared_dir
+         * set must be loaded (via symbol dependencies) after parents
+         * and unloaded beforehand.
+         */
+        if (module->shared_directory) {
+                struct toi_module_ops *shared =
+                        toi_find_module_given_dir(module->shared_directory);
+                if (!shared) {
+                        printk(KERN_ERR "TuxOnIce: Module %s wants to share "
+                                        "%s's directory but %s isn't loaded.\n",
+                                        module->name, module->shared_directory,
+                                        module->shared_directory);
+                        toi_unregister_module(module);
+                        return -ENODEV;
+                }
+                kobj = shared->dir_kobj;
+        } else {
+                if (!strncmp(module->directory, "[ROOT]", 6))
+                        kobj = tuxonice_kobj;
+                else
+                        kobj = make_toi_sysdir(module->directory);
+        }
+        module->dir_kobj = kobj;
+        for (i = 0; i < module->num_sysfs_entries; i++) {
+                int result = toi_register_sysfs_file(kobj,
+                                &module->sysfs_data[i]);
+                if (result)
+                        return result;
+        }
+        return 0;
+}
+
+/*
+ * toi_unregister_module
+ *
+ * Remove a module.
+ */
+void toi_unregister_module(struct toi_module_ops *module)
+{
+        int i;
+
+        if (module->dir_kobj)
+                for (i = 0; i < module->num_sysfs_entries; i++)
+                        toi_unregister_sysfs_file(module->dir_kobj,
+                                        &module->sysfs_data[i]);
+
+        if (!module->shared_directory && module->directory &&
+                        strncmp(module->directory, "[ROOT]", 6))
+                remove_toi_sysdir(module->dir_kobj);
+
+        switch (module->type) {
+        case FILTER_MODULE:
+                list_del(&module->type_list);
+                toi_num_filters--;
+                break;
+        case WRITER_MODULE:
+                list_del(&module->type_list);
+                toiNumAllocators--;
+                if (toiActiveAllocator == module) {
+                        toiActiveAllocator = NULL;
+                        clear_toi_state(TOI_CAN_RESUME);
+                        clear_toi_state(TOI_CAN_HIBERNATE);
+                }
+                break;
+        case MISC_MODULE:
+        case MISC_HIDDEN_MODULE:
+        case BIO_ALLOCATOR_MODULE:
+                break;
+        default:
+                printk(KERN_ERR "Module '%s' has an invalid type."
+                        " It has been ignored.\n", module->name);
+                return;
+        }
+        list_del(&module->module_list);
+        toi_num_modules--;
+}
+
+/*
+ * toi_move_module_tail
+ *
+ * Rearrange modules when reloading the config.
+ */
+void toi_move_module_tail(struct toi_module_ops *module)
+{
+        switch (module->type) {
+        case FILTER_MODULE:
+                if (toi_num_filters > 1)
+                        list_move_tail(&module->type_list, &toi_filters);
+                break;
+        case WRITER_MODULE:
+                if (toiNumAllocators > 1)
+                        list_move_tail(&module->type_list, &toiAllocators);
+                break;
+        case MISC_MODULE:
+        case MISC_HIDDEN_MODULE:
+        case BIO_ALLOCATOR_MODULE:
+                break;
+        default:
+                printk(KERN_ERR "Module '%s' has an invalid type."
+                        " It has been ignored.\n", module->name);
+                return;
+        }
+        if ((toi_num_filters + toiNumAllocators) > 1)
+                list_move_tail(&module->module_list, &toi_modules);
+}
+
+/*
+ * toi_initialise_modules
+ *
+ * Get ready to do some work!
+ */
+int toi_initialise_modules(int starting_cycle, int early)
+{
+        struct toi_module_ops *this_module;
+        int result;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                this_module->header_requested = 0;
+                this_module->header_used = 0;
+                if (!this_module->enabled)
+                        continue;
+                if (this_module->early != early)
+                        continue;
+                if (this_module->initialise) {
+                        result = this_module->initialise(starting_cycle);
+                        if (result) {
+                                toi_cleanup_modules(starting_cycle);
+                                return result;
+                        }
+                        this_module->initialised = 1;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * toi_cleanup_modules
+ *
+ * Tell modules the work is done.
+ */
+void toi_cleanup_modules(int finishing_cycle)
+{
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (!this_module->enabled || !this_module->initialised)
+                        continue;
+                if (this_module->cleanup)
+                        this_module->cleanup(finishing_cycle);
+                this_module->initialised = 0;
+        }
+}
+
+/*
+ * toi_pre_atomic_restore_modules
+ *
+ * Call the pre_atomic_restore handler of each enabled module.
+ */
+void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
+{
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (this_module->enabled && this_module->pre_atomic_restore)
+                        this_module->pre_atomic_restore(bkd);
+        }
+}
+
+/*
+ * toi_post_atomic_restore_modules
+ *
+ * Call the post_atomic_restore handler of each enabled module.
+ */
+void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
+{
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (this_module->enabled && this_module->post_atomic_restore)
+                        this_module->post_atomic_restore(bkd);
+        }
+}
+
+/*
+ * toi_get_next_filter
+ *
+ * Get the next filter in the pipeline.
+ */
+struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
+{
+        struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
+
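+        /*
+         * With filter_sought == NULL, return the first enabled filter.
+         * Otherwise, return the first enabled filter after filter_sought.
+         * If there is none, fall through to the active allocator, which
+         * terminates the pipeline.
+         */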
+        list_for_each_entry(this_filter, &toi_filters, type_list) {
+                if (!this_filter->enabled)
+                        continue;
+                if ((last_filter == filter_sought) || (!filter_sought))
+                        return this_filter;
+                last_filter = this_filter;
+        }
+
+        return toiActiveAllocator;
+}
+
+/**
+ * toi_print_modules: Printk what support is loaded.
+ */
+void toi_print_modules(void)
+{
+        struct toi_module_ops *this_module;
+        int prev = 0;
+
+        printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for");
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                if (this_module->type == MISC_HIDDEN_MODULE)
+                        continue;
+                printk("%s %s%s%s", prev ? "," : "",
+                                this_module->enabled ? "" : "[",
+                                this_module->name,
+                                this_module->enabled ? "" : "]");
+                prev = 1;
+        }
+
+        printk(".\n");
+}
+
+/* toi_get_modules
+ *
+ * Take a reference to modules so they can't go away under us.
+ */
+
+int toi_get_modules(void)
+{
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list) {
+                struct toi_module_ops *this_module2;
+
+                if (try_module_get(this_module->module))
+                        continue;
+
+                /* Failed! Reverse gets and return error */
+                list_for_each_entry(this_module2, &toi_modules,
+                                module_list) {
+                        if (this_module == this_module2)
+                                return -EINVAL;
+                        module_put(this_module2->module);
+                }
+        }
+        return 0;
+}
+
+/* toi_put_modules
+ *
+ * Release our references to modules we used.
+ */
+
+void toi_put_modules(void)
+{
+        struct toi_module_ops *this_module;
+
+        list_for_each_entry(this_module, &toi_modules, module_list)
+                module_put(this_module->module);
+}
diff --git b/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
new file mode 100644
index 0000000..44f10ab
--- /dev/null
+++ b/kernel/power/tuxonice_modules.h
@@ -0,0 +1,212 @@
+/*
+ * kernel/power/tuxonice_modules.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations for modules. Modules are additions to
+ * TuxOnIce that provide facilities such as image compression or
+ * encryption, backends for storage of the image and user interfaces.
+ *
+ */
+
+#ifndef TOI_MODULES_H
+#define TOI_MODULES_H
+
+/* This is the maximum size we store in the image header for a module name */
+#define TOI_MAX_MODULE_NAME_LENGTH 30
+
+struct toi_boot_kernel_data;
+
+/* Per-module metadata */
+struct toi_module_header {
+        char name[TOI_MAX_MODULE_NAME_LENGTH];
+        int enabled;
+        int type;
+        int index;
+        int data_length;
+        unsigned long signature;
+};
+
+enum {
+        FILTER_MODULE,
+        WRITER_MODULE,
+        BIO_ALLOCATOR_MODULE,
+        MISC_MODULE,
+        MISC_HIDDEN_MODULE,
+};
+
+enum {
+        TOI_ASYNC,
+        TOI_SYNC
+};
+
+enum {
+        TOI_VIRT,
+        TOI_PAGE,
+};
+
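+/*
+ * I/O buffers may be passed either as already-mapped kernel virtual
+ * addresses (TOI_VIRT) or as struct page pointers (TOI_PAGE) that need a
+ * temporary kmap()/kunmap() around each access.
+ */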
+#define TOI_MAP(type, addr) \
+        (type == TOI_PAGE ? kmap(addr) : addr)
+
+#define TOI_UNMAP(type, addr) \
+        do { \
+                if (type == TOI_PAGE) \
+                        kunmap(addr); \
+        } while (0)
+
+struct toi_module_ops {
+        /* Functions common to all modules */
+        int type;
+        char *name;
+        char *directory;
+        char *shared_directory;
+        struct kobject *dir_kobj;
+        struct module *module;
+        int enabled, early, initialised;
+        struct list_head module_list;
+
+        /* List of filters or allocators */
+        struct list_head list, type_list;
+
+        /*
+         * Requirements for memory and storage in
+         * the image header.
+         */
+        int (*memory_needed) (void);
+        int (*storage_needed) (void);
+
+        int header_requested, header_used;
+
+        int (*expected_compression) (void);
+
+        /*
+         * Debug info
+         */
+        int (*print_debug_info) (char *buffer, int size);
+        int (*save_config_info) (char *buffer);
+        void (*load_config_info) (char *buffer, int len);
+
+        /*
+         * Initialise & cleanup - general routines called
+         * at the start and end of a cycle.
+         */
+        int (*initialise) (int starting_cycle);
+        void (*cleanup) (int finishing_cycle);
+
+        void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd);
+        void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd);
+
+        /*
+         * Calls for allocating storage (allocators only).
+         *
+         * Header space is requested separately and cannot fail, but the
+         * reservation is only applied when main storage is allocated.
+         * The header space reservation is thus always set prior to
+         * requesting the allocation of storage - and prior to querying
+         * how much storage is available.
+         */
+
+        unsigned long (*storage_available) (void);
+        void (*reserve_header_space) (unsigned long space_requested);
+        int (*register_storage) (void);
+        int (*allocate_storage) (unsigned long space_requested);
+        unsigned long (*storage_allocated) (void);
+        void (*free_unused_storage) (void);
+
+        /*
+         * Routines used in image I/O.
+         */
+        int (*rw_init) (int rw, int stream_number);
+        int (*rw_cleanup) (int rw);
+        int (*write_page) (unsigned long index, int buf_type, void *buf,
+                        unsigned int buf_size);
+        int (*read_page) (unsigned long *index, int buf_type, void *buf,
+                        unsigned int *buf_size);
+        int (*io_flusher) (int rw);
+
+        /* Reset module if image exists but reading aborted */
+        void (*noresume_reset) (void);
+
+        /* Read and write the metadata */
+        int (*write_header_init) (void);
+        int (*write_header_cleanup) (void);
+
+        int (*read_header_init) (void);
+        int (*read_header_cleanup) (void);
+
+        /* To be called after read_header_init */
+        int (*get_header_version) (void);
+
+        int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
+                        char *buffer_start, int buffer_size);
+
+        int (*rw_header_chunk_noreadahead) (int rw,
+                        struct toi_module_ops *owner, char *buffer_start,
+                        int buffer_size);
+
+        /* Attempt to parse an image location */
+        int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
+
+        /* Throttle I/O according to throughput */
+        void (*update_throughput_throttle) (int jif_index);
+
+        /* Flush outstanding I/O */
+        int (*finish_all_io) (void);
+
+        /* Determine whether image exists that we can restore */
+        int (*image_exists) (int quiet);
+
+        /* Mark the image as having tried to resume */
+        int (*mark_resume_attempted) (int);
+
+        /* Destroy image if one exists */
+        int (*remove_image) (void);
+
+        /* Sysfs Data */
+        struct toi_sysfs_data *sysfs_data;
+        int num_sysfs_entries;
+
+        /* Block I/O allocator */
+        struct toi_bio_allocator_ops *bio_allocator_ops;
+};
+
+extern int toi_num_modules, toiNumAllocators;
+
+extern struct toi_module_ops *toiActiveAllocator;
+extern struct list_head toi_filters, toiAllocators, toi_modules;
+
+extern void toi_prepare_console_modules(void);
+extern void toi_cleanup_console_modules(void);
+
+extern struct toi_module_ops *toi_find_module_given_name(char *name);
+extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
+
+extern int toi_register_module(struct toi_module_ops *module);
+extern void toi_move_module_tail(struct toi_module_ops *module);
+
+extern long toi_header_storage_for_modules(void);
+extern long toi_memory_for_modules(int print_parts);
+extern void print_toi_header_storage_for_modules(void);
+extern int toi_expected_compression_ratio(void);
+
+extern int toi_print_module_debug_info(char *buffer, int buffer_size);
+extern void toi_unregister_module(struct toi_module_ops *module);
+
+extern int toi_initialise_modules(int starting_cycle, int early);
+#define toi_initialise_modules_early(starting) \
+        toi_initialise_modules(starting, 1)
+#define toi_initialise_modules_late(starting) \
+        toi_initialise_modules(starting, 0)
+extern void toi_cleanup_modules(int finishing_cycle);
+
+extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
+extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
+
+extern void toi_print_modules(void);
+
+int toi_get_modules(void);
+void toi_put_modules(void);
+#endif
diff --git b/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
new file mode 100644
index 0000000..78bd31b
--- /dev/null
+++ b/kernel/power/tuxonice_netlink.c
@@ -0,0 +1,324 @@
+/*
+ * kernel/power/tuxonice_netlink.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Functions for communicating with a userspace helper via netlink.
+ */
+
+#include <linux/suspend.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include "tuxonice_netlink.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct user_helper_data *uhd_list;
+
+/*
+ * Refill our pool of SKBs for use in emergencies (eg, when eating memory and
+ * none can be allocated).
+ */
+static void toi_fill_skb_pool(struct user_helper_data *uhd)
+{
+        while (uhd->pool_level < uhd->pool_limit) {
+                struct sk_buff *new_skb =
+                        alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+
+                if (!new_skb)
+                        break;
+
+                new_skb->next = uhd->emerg_skbs;
+                uhd->emerg_skbs = new_skb;
+                uhd->pool_level++;
+        }
+}
+
+/*
+ * Try to allocate a single skb. If we can't get one, try to use one from
+ * our pool.
+ */
+static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
+{
+        struct sk_buff *skb =
+                alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+
+        if (skb)
+                return skb;
+
+        skb = uhd->emerg_skbs;
+        if (skb) {
+                uhd->pool_level--;
+                uhd->emerg_skbs = skb->next;
+                skb->next = NULL;
+        }
+
+        return skb;
+}
+
+void toi_send_netlink_message(struct user_helper_data *uhd,
+                int type, void *params, size_t len)
+{
+        struct sk_buff *skb;
+        struct nlmsghdr *nlh;
+        void *dest;
+        struct task_struct *t;
+
+        if (uhd->pid == -1)
+                return;
+
+        if (uhd->debug)
+                printk(KERN_ERR "toi_send_netlink_message: Send "
+                                "message type %d.\n", type);
+
+        skb = toi_get_skb(uhd);
+        if (!skb) {
+                printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
+                return;
+        }
+
+        nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0);
+        uhd->sock_seq++;
+
+        dest = NLMSG_DATA(nlh);
+        if (params && len > 0)
+                memcpy(dest, params, len);
+
+        netlink_unicast(uhd->nl, skb, uhd->pid, 0);
+
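+        /*
+         * Wake the userspace helper so it notices the message promptly
+         * instead of waiting to be scheduled.
+         */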
+        toi_read_lock_tasklist();
+        t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
+        if (!t) {
+                toi_read_unlock_tasklist();
+                if (uhd->pid > -1)
+                        printk(KERN_INFO "Hmm. Can't find the userspace task"
+                                " %d.\n", uhd->pid);
+                return;
+        }
+        wake_up_process(t);
+        toi_read_unlock_tasklist();
+
+        yield();
+}
+
+static void send_whether_debugging(struct user_helper_data *uhd)
+{
+        static u8 is_debugging = 1;
+
+        toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
+                        &is_debugging, sizeof(u8));
+}
+
+/*
+ * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
+ * are hibernating.
+ */
+static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
+{
+        struct task_struct *t;
+
+        if (uhd->debug)
+                printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
+
+        toi_read_lock_tasklist();
+        t = find_task_by_pid_ns(pid, &init_pid_ns);
+        if (!t) {
+                toi_read_unlock_tasklist();
+                printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
+                                pid);
+                return -EINVAL;
+        }
+
+        t->flags |= PF_NOFREEZE;
+
+        toi_read_unlock_tasklist();
+        uhd->pid = pid;
+
+        toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
+
+        return 0;
+}
+
+/*
+ * Called when the userspace process has informed us that it's ready to roll.
+ */
+static int nl_ready(struct user_helper_data *uhd, u32 version)
+{
+        if (version != uhd->interface_version) {
+                printk(KERN_INFO "%s userspace process using invalid interface"
+                                " version (%d - kernel wants %d). Trying to "
+                                "continue without it.\n",
+                                uhd->name, version, uhd->interface_version);
+                if (uhd->not_ready)
+                        uhd->not_ready();
+                return -EINVAL;
+        }
+
+        complete(&uhd->wait_for_process);
+
+        return 0;
+}
+
+void toi_netlink_close_complete(struct user_helper_data *uhd)
+{
+        if (uhd->nl) {
+                netlink_kernel_release(uhd->nl);
+                uhd->nl = NULL;
+        }
+
+        while (uhd->emerg_skbs) {
+                struct sk_buff *next = uhd->emerg_skbs->next;
+                kfree_skb(uhd->emerg_skbs);
+                uhd->emerg_skbs = next;
+        }
+
+        uhd->pid = -1;
+}
+
+static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
+                struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+        int type = nlh->nlmsg_type;
+        int *data;
+        int err;
+
+        if (uhd->debug)
+                printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
+                                type);
+
+        /* Let the more specific handler go first. It returns
+         * 1 for valid messages that it doesn't know. */
+        err = uhd->rcv_msg(skb, nlh);
+        if (err != 1)
+                return err;
+
+        /* Only allow one task to receive NOFREEZE privileges */
+        if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
+                printk(KERN_INFO "Received extra nofreeze me requests.\n");
+                return -EBUSY;
+        }
+
+        data = NLMSG_DATA(nlh);
+
+        switch (type) {
+        case NETLINK_MSG_NOFREEZE_ME:
+                return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
+        case NETLINK_MSG_GET_DEBUGGING:
+                send_whether_debugging(uhd);
+                return 0;
+        case NETLINK_MSG_READY:
+                if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) {
+                        printk(KERN_INFO "Invalid ready mesage.\n");
+                        if (uhd->not_ready)
+                                uhd->not_ready();
+                        return -EINVAL;
+                }
+                return nl_ready(uhd, (u32) *data);
+        case NETLINK_MSG_CLEANUP:
+                toi_netlink_close_complete(uhd);
+                return 0;
+        }
+
+        return -EINVAL;
+}
+
+static void toi_user_rcv_skb(struct sk_buff *skb)
+{
+        int err;
+        struct nlmsghdr *nlh;
+        struct user_helper_data *uhd = uhd_list;
+
+        while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
+                uhd = uhd->next;
+
+        if (!uhd)
+                return;
+
+        while (skb->len >= NLMSG_SPACE(0)) {
+                u32 rlen;
+
+                nlh = (struct nlmsghdr *) skb->data;
+                if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+                        return;
+
+                rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+                if (rlen > skb->len)
+                        rlen = skb->len;
+
+                err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
+                if (err)
+                        netlink_ack(skb, nlh, err);
+                else if (nlh->nlmsg_flags & NLM_F_ACK)
+                        netlink_ack(skb, nlh, 0);
+                skb_pull(skb, rlen);
+        }
+}
+
+static int netlink_prepare(struct user_helper_data *uhd)
+{
+        struct netlink_kernel_cfg cfg = {
+                .groups = 0,
+                .input = toi_user_rcv_skb,
+        };
+
+        uhd->next = uhd_list;
+        uhd_list = uhd;
+
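+        /* Arbitrary starting sequence number for outgoing netlink messages. */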
+        uhd->sock_seq = 0x42c0ffee;
+        uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg);
+        if (!uhd->nl) {
+                printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
+                                uhd->name);
+                return -ENOMEM;
+        }
+
+        toi_fill_skb_pool(uhd);
+
+        return 0;
+}
+
+void toi_netlink_close(struct user_helper_data *uhd)
+{
+        struct task_struct *t;
+
+        toi_read_lock_tasklist();
+        t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
+        if (t)
+                t->flags &= ~PF_NOFREEZE;
+        toi_read_unlock_tasklist();
+
+        toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
+}
+int toi_netlink_setup(struct user_helper_data *uhd)
+{
+        /* In case userui didn't clean up properly on us */
+        toi_netlink_close_complete(uhd);
+
+        if (netlink_prepare(uhd) < 0) {
+                printk(KERN_INFO "Netlink prepare failed.\n");
+                return 1;
+        }
+
+        if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
+                                UMH_WAIT_EXEC, uhd->debug) < 0) {
+                printk(KERN_INFO "Launch userspace program failed.\n");
+                toi_netlink_close_complete(uhd);
+                return 1;
+        }
+
+        /* Wait 2 seconds for the userspace process to make contact */
+        wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
+
+        if (uhd->pid == -1) {
+                printk(KERN_INFO "%s: Failed to contact userspace process.\n",
+                                uhd->name);
+                toi_netlink_close_complete(uhd);
+                return 1;
+        }
+
+        return 0;
+}
diff --git b/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
new file mode 100644
index 0000000..6613c8e
--- /dev/null
+++ b/kernel/power/tuxonice_netlink.h
@@ -0,0 +1,62 @@
+/*
+ * kernel/power/tuxonice_netlink.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Declarations for functions for communicating with a userspace helper
+ * via netlink.
+ */
+
+#include <linux/netlink.h>
+#include <net/sock.h>
+
+#define NETLINK_MSG_BASE 0x10
+
+#define NETLINK_MSG_READY 0x10
+#define NETLINK_MSG_NOFREEZE_ME 0x16
+#define NETLINK_MSG_GET_DEBUGGING 0x19
+#define NETLINK_MSG_CLEANUP 0x24
+#define NETLINK_MSG_NOFREEZE_ACK 0x27
+#define NETLINK_MSG_IS_DEBUGGING 0x28
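+
+/*
+ * Message flow, as implemented in tuxonice_netlink.c: the userspace helper
+ * sends NETLINK_MSG_NOFREEZE_ME (answered with NETLINK_MSG_NOFREEZE_ACK) and
+ * NETLINK_MSG_READY carrying its interface version; it may request
+ * NETLINK_MSG_GET_DEBUGGING, which is answered with NETLINK_MSG_IS_DEBUGGING;
+ * NETLINK_MSG_CLEANUP tears the channel down.
+ */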
+
+struct user_helper_data {
+        int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
+        void (*not_ready) (void);
+        struct sock *nl;
+        u32 sock_seq;
+        pid_t pid;
+        char *comm;
+        char program[256];
+        int pool_level;
+        int pool_limit;
+        struct sk_buff *emerg_skbs;
+        int skb_size;
+        int netlink_id;
+        char *name;
+        struct user_helper_data *next;
+        struct completion wait_for_process;
+        u32 interface_version;
+        int must_init;
+        int debug;
+};
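+
+/*
+ * Note (descriptive, based on tuxonice_netlink.c): callers fill in at least
+ * rcv_msg, name, program, netlink_id, interface_version, pool_limit and
+ * skb_size, and presumably initialise wait_for_process, before calling
+ * toi_netlink_setup().
+ */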
+
+#ifdef CONFIG_NET
+int toi_netlink_setup(struct user_helper_data *uhd);
+void toi_netlink_close(struct user_helper_data *uhd);
+void toi_send_netlink_message(struct user_helper_data *uhd,
+                int type, void *params, size_t len);
+void toi_netlink_close_complete(struct user_helper_data *uhd);
+#else
+static inline int toi_netlink_setup(struct user_helper_data *uhd)
+{
+        return 0;
+}
+
+static inline void toi_netlink_close(struct user_helper_data *uhd) { };
+static inline void toi_send_netlink_message(struct user_helper_data *uhd,
+                int type, void *params, size_t len) { };
+static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
+        { };
+#endif
diff --git b/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
new file mode 100644
index 0000000..d469f3d
--- /dev/null
+++ b/kernel/power/tuxonice_pagedir.c
@@ -0,0 +1,345 @@
+/*
+ * kernel/power/tuxonice_pagedir.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for handling pagesets.
+ * Note that pbes aren't actually stored as such. They're stored as
+ * bitmaps and extents.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <asm/tlbflush.h>
+
+#include "tuxonice_pageflags.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_alloc.h"
+
+static int ptoi_pfn;
+static struct pbe *this_low_pbe;
+static struct pbe **last_low_pbe_ptr;
+
+void toi_reset_alt_image_pageset2_pfn(void)
+{
+        memory_bm_position_reset(pageset2_map);
+}
+
+static struct page *first_conflicting_page;
+
+/*
+ * free_conflicting_pages
+ *
+ * Free the chain of conflicting pages built up by
+ * ___toi_get_nonconflicting_page(). Each page in the chain stores a pointer
+ * to the next one at its start (written via kmap).
+ */
+
+static void free_conflicting_pages(void)
+{
+        while (first_conflicting_page) {
+                struct page *next =
+                        *((struct page **) kmap(first_conflicting_page));
+                kunmap(first_conflicting_page);
+                toi__free_page(29, first_conflicting_page);
+                first_conflicting_page = next;
+        }
+}
+
+/* ___toi_get_nonconflicting_page
+ *
+ * Description: Gets order zero pages that won't be overwritten
+ *                while copying the original pages.
+ */
+
+struct page *___toi_get_nonconflicting_page(int can_be_highmem)
+{
+        struct page *page;
+        gfp_t flags = TOI_ATOMIC_GFP;
+
+        if (can_be_highmem)
+                flags |= __GFP_HIGHMEM;
+
+        if (test_toi_state(TOI_LOADING_ALT_IMAGE) &&
+                        pageset2_map && ptoi_pfn) {
+                do {
+                        ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0);
+                        if (ptoi_pfn != BM_END_OF_MAP) {
+                                page = pfn_to_page(ptoi_pfn);
+                                if (!PagePageset1(page) &&
+                                    (can_be_highmem || !PageHighMem(page)))
+                                        return page;
+                        }
+                } while (ptoi_pfn);
+        }
+
+        do {
+                page = toi_alloc_page(29, flags | __GFP_ZERO);
+                if (!page) {
+                        printk(KERN_INFO "Failed to get nonconflicting "
+                                        "page.\n");
+                        return NULL;
+                }
+                if (PagePageset1(page)) {
+                        struct page **next = (struct page **) kmap(page);
+                        *next = first_conflicting_page;
+                        first_conflicting_page = page;
+                        kunmap(page);
+                }
+        } while (PagePageset1(page));
+
+        return page;
+}
+
+unsigned long __toi_get_nonconflicting_page(void)
+{
+        struct page *page = ___toi_get_nonconflicting_page(0);
+        return page ? (unsigned long) page_address(page) : 0;
+}
+
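+/*
+ * get_next_pbe
+ *
+ * Return the next pbe slot. pbes are packed into nonconflicting pages: when
+ * fewer than two struct pbes of space remain in the current page, a fresh
+ * nonconflicting page is allocated (returned via page_ptr) and its first
+ * slot is used instead.
+ */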
+static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
+                int highmem)
+{
+        if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
+                     + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
+                struct page *new_page =
+                        ___toi_get_nonconflicting_page(highmem);
+                if (!new_page)
+                        return ERR_PTR(-ENOMEM);
+                this_pbe = (struct pbe *) kmap(new_page);
+                memset(this_pbe, 0, PAGE_SIZE);
+                *page_ptr = new_page;
+        } else
+                this_pbe++;
+
+        return this_pbe;
+}
+
+/**
+ * get_pageset1_load_addresses - generate pbes for conflicting pages
+ *
+ * We check here that pagedir & pages it points to won't collide
+ * with pages where we're going to restore from the loaded pages
+ * later.
+ *
+ * Returns:
+ *        Zero on success, one if couldn't find enough pages (shouldn't
+ *        happen).
+ **/
+int toi_get_pageset1_load_addresses(void)
+{
+        int pfn, highallocd = 0, lowallocd = 0;
+        int low_needed = pagedir1.size - get_highmem_size(pagedir1);
+        int high_needed = get_highmem_size(pagedir1);
+        int low_pages_for_highmem = 0;
+        gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
+        struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
+                    *low_pbe_page, *last_low_pbe_page = NULL;
+        struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
+                   *this_high_pbe = NULL;
+        unsigned long orig_low_pfn, orig_high_pfn;
+        int high_pbes_done = 0, low_pbes_done = 0;
+        int low_direct = 0, high_direct = 0, result = 0, i;
+        int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0;
+
+        toi_trace_index++;
+
+        memory_bm_position_reset(pageset1_map);
+        memory_bm_position_reset(pageset1_copy_map);
+
+        last_low_pbe_ptr = &restore_pblist;
+
+        /* First, allocate pages for the start of our pbe lists. */
+        if (high_needed) {
+                high_pbe_page = ___toi_get_nonconflicting_page(1);
+                if (!high_pbe_page) {
+                        result = -ENOMEM;
+                        goto out;
+                }
+                this_high_pbe = (struct pbe *) kmap(high_pbe_page);
+                memset(this_high_pbe, 0, PAGE_SIZE);
+        }
+
+        low_pbe_page = ___toi_get_nonconflicting_page(0);
+        if (!low_pbe_page) {
+                result = -ENOMEM;
+                goto out;
+        }
+        this_low_pbe = (struct pbe *) page_address(low_pbe_page);
+
+        /*
+         * Next, allocate the number of pages we need.
+         */
+
+        i = low_needed + high_needed;
+
+        do {
+                int is_high;
+
+                if (i == low_needed)
+                        flags &= ~__GFP_HIGHMEM;
+
+                page = toi_alloc_page(30, flags);
+                BUG_ON(!page);
+
+                SetPagePageset1Copy(page);
+                is_high = PageHighMem(page);
+
+                if (PagePageset1(page)) {
+                        if (is_high)
+                                high_direct++;
+                        else
+                                low_direct++;
+                } else {
+                        if (is_high)
+                                highallocd++;
+                        else
+                                lowallocd++;
+                }
+        } while (--i);
+
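+        /*
+         * Allocated pages that already belong to pageset1 were counted as
+         * "direct" above; they reduce the number of copy pages (and hence
+         * pbes) still needed.
+         */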
+        high_needed -= high_direct;
+        low_needed -= low_direct;
+
+        /*
+         * Do we need to use some lowmem pages for the copies of highmem
+         * pages?
+         */
+        if (high_needed > highallocd) {
+                low_pages_for_highmem = high_needed - highallocd;
+                high_needed -= low_pages_for_highmem;
+                low_needed += low_pages_for_highmem;
+        }
+
+        /*
+         * Now generate our pbes (which will be used for the atomic restore),
+         * and free unneeded pages.
+         */
+        memory_bm_position_reset(pageset1_copy_map);
+        for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP;
+                        pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) {
+                int is_high;
+                page = pfn_to_page(pfn);
+                is_high = PageHighMem(page);
+
+                if (PagePageset1(page))
+                        continue;
+
+                /* Nope. We're going to use this page. Add a pbe. */
+                if (is_high || low_pages_for_highmem) {
+                        struct page *orig_page;
+                        high_pbes_done++;
+                        if (!is_high)
+                                low_pages_for_highmem--;
+                        do {
+                                orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0);
+                                BUG_ON(orig_high_pfn == BM_END_OF_MAP);
+                                orig_page = pfn_to_page(orig_high_pfn);
+                        } while (!PageHighMem(orig_page) ||
+                                        PagePageset1Copy(orig_page));
+
+                        this_high_pbe->orig_address = (void *) orig_high_pfn;
+                        this_high_pbe->address = page;
+                        this_high_pbe->next = NULL;
+                        toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p",
+                                        high_page, high_offset, page, orig_high_pfn, orig_page);
+                        if (last_high_pbe_page != high_pbe_page) {
+                                *last_high_pbe_ptr =
+                                        (struct pbe *) high_pbe_page;
+                                if (last_high_pbe_page) {
+                                        kunmap(last_high_pbe_page);
+                                        high_page++;
+                                        high_offset = 0;
+                                } else
+                                        high_offset++;
+                                last_high_pbe_page = high_pbe_page;
+                        } else {
+                                *last_high_pbe_ptr = this_high_pbe;
+                                high_offset++;
+                        }
+                        last_high_pbe_ptr = &this_high_pbe->next;
+                        this_high_pbe = get_next_pbe(&high_pbe_page,
+                                        this_high_pbe, 1);
+                        if (IS_ERR(this_high_pbe)) {
+                                printk(KERN_INFO
+                                                "This high pbe is an error.\n");
+                                return -ENOMEM;
+                        }
+                } else {
+                        struct page *orig_page;
+                        low_pbes_done++;
+                        do {
+                                orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0);
+                                BUG_ON(orig_low_pfn == BM_END_OF_MAP);
+                                orig_page = pfn_to_page(orig_low_pfn);
+                        } while (PageHighMem(orig_page) ||
+                                        PagePageset1Copy(orig_page));
+
+                        this_low_pbe->orig_address = page_address(orig_page);
+                        this_low_pbe->address = page_address(page);
+                        this_low_pbe->next = NULL;
+                        toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p",
+                                        low_page, low_offset, this_low_pbe->orig_address,
+                                        orig_low_pfn, this_low_pbe->address);
+                        TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address);
+                        *last_low_pbe_ptr = this_low_pbe;
+                        last_low_pbe_ptr = &this_low_pbe->next;
+                        this_low_pbe = get_next_pbe(&low_pbe_page,
+                                        this_low_pbe, 0);
+                        if (low_pbe_page != last_low_pbe_page) {
+                                if (last_low_pbe_page) {
+                                        low_page++;
+                                        low_offset = 0;
+                                } else {
+                                    low_offset++;
+                                }
+                                last_low_pbe_page = low_pbe_page;
+                        } else
+                                low_offset++;
+                        if (IS_ERR(this_low_pbe)) {
+                                printk(KERN_INFO "this_low_pbe is an error.\n");
+                                return -ENOMEM;
+                        }
+                }
+        }
+
+        if (high_pbe_page)
+                kunmap(high_pbe_page);
+
+        if (last_high_pbe_page != high_pbe_page) {
+                if (last_high_pbe_page)
+                        kunmap(last_high_pbe_page);
+                toi__free_page(29, high_pbe_page);
+        }
+
+        free_conflicting_pages();
+
+out:
+        return result;
+}
+
+int add_boot_kernel_data_pbe(void)
+{
+        this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
+        if (!this_low_pbe->address) {
+                printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
+                return -ENOMEM;
+        }
+
+        toi_bkd.size = sizeof(toi_bkd);
+        memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
+
+        *last_low_pbe_ptr = this_low_pbe;
+        this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
+        this_low_pbe->next = NULL;
+        return 0;
+}
diff --git b/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
new file mode 100644
index 0000000..0465359
--- /dev/null
+++ b/kernel/power/tuxonice_pagedir.h
@@ -0,0 +1,50 @@
+/*
+ * kernel/power/tuxonice_pagedir.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Declarations for routines for handling pagesets.
+ */
+
+#ifndef KERNEL_POWER_PAGEDIR_H
+#define KERNEL_POWER_PAGEDIR_H
+
+/* Pagedir
+ *
+ * Contains the metadata for a set of pages saved in the image.
+ */
+
+struct pagedir {
+        int id;
+        unsigned long size;
+#ifdef CONFIG_HIGHMEM
+        unsigned long size_high;
+#endif
+};
+
+#ifdef CONFIG_HIGHMEM
+#define get_highmem_size(pagedir) (pagedir.size_high)
+#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
+#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
+#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
+#else
+#define get_highmem_size(pagedir) (0)
+#define set_highmem_size(pagedir, sz) do { } while (0)
+#define inc_highmem_size(pagedir) do { } while (0)
+#define get_lowmem_size(pagedir) (pagedir.size)
+#endif
+
+extern struct pagedir pagedir1, pagedir2;
+
+extern void toi_copy_pageset1(void);
+
+extern int toi_get_pageset1_load_addresses(void);
+
+extern unsigned long __toi_get_nonconflicting_page(void);
+struct page *___toi_get_nonconflicting_page(int can_be_highmem);
+
+extern void toi_reset_alt_image_pageset2_pfn(void);
+extern int add_boot_kernel_data_pbe(void);
+#endif
diff --git b/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
new file mode 100644
index 0000000..0fe92ed
--- /dev/null
+++ b/kernel/power/tuxonice_pageflags.c
@@ -0,0 +1,18 @@
+/*
+ * kernel/power/tuxonice_pageflags.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for serialising and relocating pageflags in which we
+ * store our image metadata.
+ */
+
+#include "tuxonice_pageflags.h"
+#include "power.h"
+
+int toi_pageflags_space_needed(void)
+{
+        return memory_bm_space_needed(pageset1_map);
+}
diff --git b/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
new file mode 100644
index 0000000..ddeeaf1
--- /dev/null
+++ b/kernel/power/tuxonice_pageflags.h
@@ -0,0 +1,106 @@
+/*
+ * kernel/power/tuxonice_pageflags.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H
+#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H
+
+struct memory_bitmap;
+
+void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+void memory_bm_clear(struct memory_bitmap *bm);
+void memory_bm_position_reset(struct memory_bitmap *bm);
+int toi_alloc_bitmap(struct memory_bitmap **bm);
+void toi_free_bitmap(struct memory_bitmap **bm);
+
+int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
+unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index);
+
+struct toi_module_ops;
+int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
+        (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
+        (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+int memory_bm_space_needed(struct memory_bitmap *bm);
+
+extern struct memory_bitmap *pageset1_map;
+extern struct memory_bitmap *pageset1_copy_map;
+extern struct memory_bitmap *pageset2_map;
+extern struct memory_bitmap *page_resave_map;
+extern struct memory_bitmap *io_map;
+extern struct memory_bitmap *nosave_map;
+extern struct memory_bitmap *free_map;
+extern struct memory_bitmap *compare_map;
+
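+/*
+ * The wrappers below test, set or clear a page's bit in the relevant bitmap.
+ * The second argument to the memory_bm_* helpers is an index (here the
+ * current CPU, via smp_processor_id()) used by the TuxOnIce bitmap code.
+ */
+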
+#define PagePageset1(page) \
+        (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset1(page) \
+        (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset1(page) \
+        (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PagePageset1Copy(page) \
+        (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset1Copy(page) \
+        (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset1Copy(page) \
+        (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PagePageset2(page) \
+        (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset2(page) \
+        (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset2(page) \
+        (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageWasRW(page) \
+        (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPageWasRW(page) \
+        (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageWasRW(page) \
+        (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageResave(page) (page_resave_map ? \
+        memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageResave(page) \
+        (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageResave(page) \
+        (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageNosave(page) (nosave_map ? \
+        memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageNosave(page) \
+        (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageNosave(page) \
+        (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageNosaveFree(page) (free_map ? \
+                memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageNosaveFree(page) \
+        (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageNosaveFree(page) \
+        (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageCompareChanged(page) (compare_map ? \
+                memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageCompareChanged(page) \
+        (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageCompareChanged(page) \
+        (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
+
+extern void save_pageflags(struct memory_bitmap *pagemap);
+extern int load_pageflags(struct memory_bitmap *pagemap);
+extern int toi_pageflags_space_needed(void);
+#endif
diff --git b/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
new file mode 100644
index 0000000..7c78773
--- /dev/null
+++ b/kernel/power/tuxonice_power_off.c
@@ -0,0 +1,286 @@
+/*
+ * kernel/power/tuxonice_power_off.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+#include <linux/device.h>
+#include <linux/suspend.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/fs.h>
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+
+unsigned long toi_poweroff_method; /* 0 - Kernel power off */
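+
+/*
+ * Values handled by __toi_power_down() below: 0 = normal power off,
+ * 3 = suspend to RAM first ("suspend to both"), 4 = enter the platform
+ * hibernation state, 5 = historic entry, now a no-op. Anything else falls
+ * back to the plain power off path.
+ */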
+
+static int wake_delay;
+static char lid_state_file[256], wake_alarm_dir[256];
+static struct file *lid_file, *alarm_file, *epoch_file;
+static int post_wake_state = -1;
+
+static int did_suspend_to_both;
+
+/*
+ * __toi_power_down
+ * Functionality   : Powers down or reboots the computer once the image
+ *                   has been written to disk.
+ * Key Assumptions : We are able to reboot/power down via the code called,
+ *                   or the warning emitted if those calls fail will be
+ *                   visible to the user (i.e. printk resumes devices).
+ */
+
+static void __toi_power_down(int method)
+{
+        int error;
+
+        toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." :
+                        "Powering down.");
+
+        if (test_result_state(TOI_ABORTED))
+                goto out;
+
+        if (test_action_state(TOI_REBOOT))
+                kernel_restart(NULL);
+
+        switch (method) {
+        case 0:
+                break;
+        case 3:
+                /*
+                 * Re-read the overwritten part of pageset2 to make post-resume
+                 * faster.
+                 */
+                if (read_pageset2(1))
+                        panic("Attempt to reload pagedir 2 failed. "
+                                        "Try rebooting.");
+
+                pm_prepare_console();
+
+                error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+                if (!error) {
+                        pm_restore_gfp_mask();
+                        error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+                        pm_restrict_gfp_mask();
+                        if (!error)
+                                did_suspend_to_both = 1;
+                }
+                pm_notifier_call_chain(PM_POST_SUSPEND);
+                pm_restore_console();
+
+                /* Success - we're now post-resume-from-ram */
+                if (did_suspend_to_both)
+                        return;
+
+                /* Failed to suspend to ram - do normal power off */
+                break;
+        case 4:
+                /*
+                 * If succeeds, doesn't return. If fails, do a simple
+                 * powerdown.
+                 */
+                hibernation_platform_enter();
+                break;
+        case 5:
+                /* Historic entry only now */
+                break;
+        }
+
+        if (method && method != 5)
+                toi_cond_pause(1,
+                        "Falling back to alternate power off method.");
+
+        if (test_result_state(TOI_ABORTED))
+                goto out;
+
+        if (pm_power_off)
+                kernel_power_off();
+        kernel_halt();
+        toi_cond_pause(1, "Powerdown failed.");
+        while (1)
+                cpu_relax();
+
+out:
+        if (read_pageset2(1))
+                panic("Attempt to reload pagedir 2 failed. Try rebooting.");
+        return;
+}
+
+#define CLOSE_FILE(file) \
+        do { \
+                if (file) { \
+                        filp_close(file, NULL); \
+                        file = NULL; \
+                } \
+        } while (0)
+
+static void powerdown_cleanup(int toi_or_resume)
+{
+        if (!toi_or_resume)
+                return;
+
+        CLOSE_FILE(lid_file);
+        CLOSE_FILE(alarm_file);
+        CLOSE_FILE(epoch_file);
+}
+
+static void open_file(char *format, char *arg, struct file **var, int mode,
+                char *desc)
+{
+        char buf[256];
+
+        if (strlen(arg)) {
+                snprintf(buf, sizeof(buf), format, arg);
+                *var = filp_open(buf, mode, 0);
+                if (IS_ERR(*var) || !*var) {
+                        printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
+                                desc, buf, *var);
+                        *var = NULL;
+                }
+        }
+}
+
+static int powerdown_init(int toi_or_resume)
+{
+        if (!toi_or_resume)
+                return 0;
+
+        did_suspend_to_both = 0;
+
+        open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
+                        O_RDONLY, "lid");
+
+        if (strlen(wake_alarm_dir)) {
+                open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
+                                &alarm_file, O_WRONLY, "alarm");
+
+                open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
+                                &epoch_file, O_RDONLY, "epoch");
+        }
+
+        return 0;
+}
+
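+/*
+ * Check the ACPI lid state file (opened in powerdown_init() from
+ * /proc/acpi/button/<lid_state_file>/state), which reports a line such as
+ * "state:      closed" when the lid is shut.
+ */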
+static int lid_closed(void)
+{
+        char array[25];
+        ssize_t size;
+        loff_t pos = 0;
+
+        if (!lid_file)
+                return 0;
+
+        size = vfs_read(lid_file, (char __user *) array, sizeof(array) - 1,
+                        &pos);
+        if ((int) size < 1) {
+                printk(KERN_INFO "Failed to read lid state file (%d).\n",
+                        (int) size);
+                return 0;
+        }
+
+        /* Ensure the buffer is NUL-terminated before comparing. */
+        array[size] = '\0';
+
+        if (!strcmp(array, "state:      closed\n"))
+                return 1;
+
+        return 0;
+}
+
+static void write_alarm_file(int value)
+{
+        ssize_t size;
+        char buf[40];
+        loff_t pos = 0;
+
+        if (!alarm_file)
+                return;
+
+        sprintf(buf, "%d\n", value);
+
+        size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
+
+        if (size < 0)
+                printk(KERN_INFO "Error %d writing alarm value %s.\n",
+                                (int) size, buf);
+}
+
+/**
+ * toi_check_resleep: See whether to powerdown again after waking.
+ *
+ * After waking, check whether we should powerdown again in a (usually
+ * different) way. We only do this if the lid switch is still closed.
+ */
+void toi_check_resleep(void)
+{
+        /* We only return if we suspended to ram and woke. */
+        if (lid_closed() && post_wake_state >= 0)
+                __toi_power_down(post_wake_state);
+}
+
+void toi_power_down(void)
+{
+        if (alarm_file && wake_delay) {
+                char array[25];
+                loff_t pos = 0;
+                ssize_t size = vfs_read(epoch_file, (char __user *) array,
+                                sizeof(array) - 1, &pos);
+
+                if ((int) size < 1)
+                        printk(KERN_INFO "Failed to read epoch file (%d).\n",
+                                        (int) size);
+                else {
+                        unsigned long since_epoch;
+
+                        /* NUL-terminate before parsing the number. */
+                        array[size] = '\0';
+                        if (!kstrtoul(array, 0, &since_epoch)) {
+                                /* Clear any wakeup time. */
+                                write_alarm_file(0);
+
+                                /* Set new wakeup time. */
+                                write_alarm_file(since_epoch + wake_delay);
+                        }
+                }
+        }
+
+        __toi_power_down(toi_poweroff_method);
+
+        toi_check_resleep();
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_ACPI)
+        SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL),
+        SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL),
+        SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL),
+        SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0,
+                        NULL),
+        SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0),
+        SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both,
+                0, 0, 0, NULL)
+#endif
+};
+
+static struct toi_module_ops powerdown_ops = {
+        .type                                = MISC_HIDDEN_MODULE,
+        .name                                = "poweroff",
+        .initialise                        = powerdown_init,
+        .cleanup                        = powerdown_cleanup,
+        .directory                        = "[ROOT]",
+        .module                                = THIS_MODULE,
+        .sysfs_data                        = sysfs_params,
+        .num_sysfs_entries                = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+int toi_poweroff_init(void)
+{
+        return toi_register_module(&powerdown_ops);
+}
+
+void toi_poweroff_exit(void)
+{
+        toi_unregister_module(&powerdown_ops);
+}
diff --git b/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
new file mode 100644
index 0000000..6e1d8bb
--- /dev/null
+++ b/kernel/power/tuxonice_power_off.h
@@ -0,0 +1,24 @@
+/*
+ * kernel/power/tuxonice_power_off.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+int toi_pm_state_finish(void);
+void toi_power_down(void);
+extern unsigned long toi_poweroff_method;
+int toi_poweroff_init(void);
+void toi_poweroff_exit(void);
+void toi_check_resleep(void);
+
+extern int platform_begin(int platform_mode);
+extern int platform_pre_snapshot(int platform_mode);
+extern void platform_leave(int platform_mode);
+extern void platform_end(int platform_mode);
+extern void platform_finish(int platform_mode);
+extern int platform_pre_restore(int platform_mode);
+extern void platform_restore_cleanup(int platform_mode);
diff --git b/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
new file mode 100644
index 0000000..df37cc8
--- /dev/null
+++ b/kernel/power/tuxonice_prepare_image.c
@@ -0,0 +1,1089 @@
+/*
+ * kernel/power/tuxonice_prepare_image.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * We need to eat memory until we can:
+ * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
+ * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
+ *    main_storage_needed())
+ * 3. Reload the pagedir and pageset1 to places that don't collide with their
+ *    final destinations, not knowing to what extent the resumed kernel will
+ *    overlap with the one loaded at boot time. I think the resumed kernel
+ *    should overlap completely, but I don't want to rely on this as it is
+ *    an unproven assumption. We therefore assume there will be no overlap at
+ *    all (worst case).
+ * 4. Meet the user's requested limit (if any) on the size of the image.
+ *    The limit is in MB, so pages/256 (assuming 4K pages).
+ *
+ */
+
+#include <linux/highmem.h>
+#include <linux/freezer.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/console.h>
+#include <linux/tuxonice.h>
+
+#include "tuxonice_pageflags.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_builtin.h"
+
+static unsigned long num_nosave, main_storage_allocated, storage_limit,
+            header_storage_needed;
+unsigned long extra_pd1_pages_allowance =
+        CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
+long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT;
+static int no_ps2_needed;
+
+struct attention_list {
+        struct task_struct *task;
+        struct attention_list *next;
+};
+
+static struct attention_list *attention_list;
+
+#define PAGESET1 0
+#define PAGESET2 1
+
+void free_attention_list(void)
+{
+        struct attention_list *last = NULL;
+
+        while (attention_list) {
+                last = attention_list;
+                attention_list = attention_list->next;
+                toi_kfree(6, last, sizeof(*last));
+        }
+}
+
+static int build_attention_list(void)
+{
+        int i, task_count = 0;
+        struct task_struct *p;
+        struct attention_list *next;
+
+        /*
+         * Count all processes marked PF_NOFREEZE, plus the current task.
+         */
+        toi_read_lock_tasklist();
+        for_each_process(p)
+                if ((p->flags & PF_NOFREEZE) || p == current)
+                        task_count++;
+        toi_read_unlock_tasklist();
+
+        /*
+         * Allocate attention list structs.
+         */
+        for (i = 0; i < task_count; i++) {
+                struct attention_list *this =
+                        toi_kzalloc(6, sizeof(struct attention_list),
+                                        TOI_WAIT_GFP);
+                if (!this) {
+                        printk(KERN_INFO "Failed to allocate slab for "
+                                        "attention list.\n");
+                        free_attention_list();
+                        return 1;
+                }
+                this->next = NULL;
+                if (attention_list)
+                        this->next = attention_list;
+                attention_list = this;
+        }
+
+        next = attention_list;
+        toi_read_lock_tasklist();
+        for_each_process(p)
+                if ((p->flags & PF_NOFREEZE) || p == current) {
+                        next->task = p;
+                        next = next->next;
+                }
+        toi_read_unlock_tasklist();
+        return 0;
+}
+
+static void pageset2_full(void)
+{
+        struct zone *zone;
+        struct page *page;
+        unsigned long flags;
+        int i;
+
+        toi_trace_index++;
+
+        for_each_populated_zone(zone) {
+                spin_lock_irqsave(&zone->lru_lock, flags);
+                for_each_lru(i) {
+                        if (!zone_page_state(zone, NR_LRU_BASE + i))
+                                continue;
+
+                        list_for_each_entry(page, &zone->lruvec.lists[i], lru) {
+                                struct address_space *mapping;
+
+                                mapping = page_mapping(page);
+                                if (!mapping || !mapping->host ||
+                                    !(mapping->host->i_flags & S_ATOMIC_COPY)) {
+                                    if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
+                                        TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified.");
+                                    } else {
+                                        TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full.");
+                                        SetPagePageset2(page);
+                                    }
+                                }
+                        }
+                }
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
+        }
+}
+
+/*
+ * toi_mark_task_as_pageset
+ * Functionality   : Marks all the saveable pages belonging to a given process
+ *                      as belonging to a particular pageset.
+ */
+
+static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
+{
+        struct vm_area_struct *vma;
+        struct mm_struct *mm;
+
+        mm = t->active_mm;
+
+        if (!mm || !mm->mmap)
+                return;
+
+        toi_trace_index++;
+
+        if (!irqs_disabled())
+                down_read(&mm->mmap_sem);
+
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                unsigned long posn;
+
+                if (!vma->vm_start ||
+                    vma->vm_flags & VM_PFNMAP)
+                        continue;
+
+                for (posn = vma->vm_start; posn < vma->vm_end;
+                                posn += PAGE_SIZE) {
+                        struct page *page = follow_page(vma, posn, 0);
+                        struct address_space *mapping;
+
+                        if (!page || !pfn_valid(page_to_pfn(page)))
+                                continue;
+
+                        mapping = page_mapping(page);
+                        if (mapping && mapping->host &&
+                            mapping->host->i_flags & S_ATOMIC_COPY && pageset2)
+                                continue;
+
+                        if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
+                                TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2);
+                                continue;
+                        }
+
+                        if (pageset2) {
+                                TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1");
+                                SetPagePageset2(page);
+                        } else {
+                                TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2");
+                                ClearPagePageset2(page);
+                                SetPagePageset1(page);
+                        }
+                }
+        }
+
+        if (!irqs_disabled())
+                up_read(&mm->mmap_sem);
+}
+
+static void mark_tasks(int pageset)
+{
+        struct task_struct *p;
+
+        toi_read_lock_tasklist();
+        for_each_process(p) {
+                if (!p->mm)
+                        continue;
+
+                if (p->flags & PF_KTHREAD)
+                        continue;
+
+                toi_mark_task_as_pageset(p, pageset);
+        }
+        toi_read_unlock_tasklist();
+
+}
+
+/* toi_mark_pages_for_pageset2
+ *
+ * Description:        Mark unshared pages in processes not needed for hibernation
+ *                 so that they can be written out in a separate pagedir.
+ *                 HighMem pages are simply marked as pageset2. They won't be
+ *                 needed during hibernation.
+ */
+
+static void toi_mark_pages_for_pageset2(void)
+{
+        struct attention_list *this = attention_list;
+
+        memory_bm_clear(pageset2_map);
+
+        if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed)
+                return;
+
+        if (test_action_state(TOI_PAGESET2_FULL))
+                pageset2_full();
+        else
+                mark_tasks(PAGESET2);
+
+        /*
+         * Because the tasks in attention_list are ones related to hibernating,
+         * we know that they won't go away under us.
+         */
+
+        while (this) {
+                if (!test_result_state(TOI_ABORTED))
+                        toi_mark_task_as_pageset(this->task, PAGESET1);
+                this = this->next;
+        }
+}
+
+/*
+ * The atomic copy of pageset1 is stored in pageset2 pages.
+ * But if pageset1 is larger (normally only just after boot),
+ * we need to allocate extra pages to store the atomic copy.
+ * The following data struct and functions are used to handle
+ * the allocation and freeing of that memory.
+ */
+
+static unsigned long extra_pages_allocated;
+
+struct extras {
+        struct page *page;
+        int order;
+        struct extras *next;
+};
+
+static struct extras *extras_list;
+
+/* toi_free_extra_pagedir_memory
+ *
+ * Description:        Free previously allocated extra pagedir memory.
+ */
+void toi_free_extra_pagedir_memory(void)
+{
+        /* Free allocated pages */
+        while (extras_list) {
+                struct extras *this = extras_list;
+                int i;
+
+                extras_list = this->next;
+
+                for (i = 0; i < (1 << this->order); i++)
+                        ClearPageNosave(this->page + i);
+
+                toi_free_pages(9, this->page, this->order);
+                toi_kfree(7, this, sizeof(*this));
+        }
+
+        extra_pages_allocated = 0;
+}
+
+/* toi_allocate_extra_pagedir_memory
+ *
+ * Description:        Allocate memory for making the atomic copy of pagedir1 in the
+ *                 case where it is bigger than pagedir2.
+ * Arguments:        int        num_to_alloc: Number of extra pages needed.
+ * Result:        int.         Number of extra pages we now have allocated.
+ */
+static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
+{
+        int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
+        gfp_t flags = TOI_ATOMIC_GFP;
+
+        if (num_to_alloc < 1)
+                return 0;
+
+        order = fls(num_to_alloc);
+        if (order >= MAX_ORDER)
+                order = MAX_ORDER - 1;
+
+        while (num_to_alloc) {
+                struct page *newpage;
+                unsigned long virt;
+                struct extras *extras_entry;
+
+                while ((1 << order) > num_to_alloc)
+                        order--;
+
+                extras_entry = (struct extras *) toi_kzalloc(7,
+                        sizeof(struct extras), TOI_ATOMIC_GFP);
+
+                if (!extras_entry)
+                        return extra_pages_allocated;
+
+                virt = toi_get_free_pages(9, flags, order);
+                while (!virt && order) {
+                        order--;
+                        virt = toi_get_free_pages(9, flags, order);
+                }
+
+                if (!virt) {
+                        toi_kfree(7, extras_entry, sizeof(*extras_entry));
+                        return extra_pages_allocated;
+                }
+
+                newpage = virt_to_page(virt);
+
+                extras_entry->page = newpage;
+                extras_entry->order = order;
+                extras_entry->next = extras_list;
+
+                extras_list = extras_entry;
+
+                for (j = 0; j < (1 << order); j++) {
+                        SetPageNosave(newpage + j);
+                        SetPagePageset1Copy(newpage + j);
+                }
+
+                extra_pages_allocated += (1 << order);
+                num_to_alloc -= (1 << order);
+        }
+
+        return extra_pages_allocated;
+}
+
+/*
+ * real_nr_free_pages: Count pcp pages for a zone type or all zones
+ * (-1 for all, otherwise zone_idx() result desired).
+ */
+unsigned long real_nr_free_pages(unsigned long zone_idx_mask)
+{
+        struct zone *zone;
+        int result = 0, cpu;
+
+        /* PCP lists */
+        for_each_populated_zone(zone) {
+                if (!(zone_idx_mask & (1 << zone_idx(zone))))
+                        continue;
+
+                for_each_online_cpu(cpu) {
+                        struct per_cpu_pageset *pset =
+                                per_cpu_ptr(zone->pageset, cpu);
+                        struct per_cpu_pages *pcp = &pset->pcp;
+                        result += pcp->count;
+                }
+
+                result += zone_page_state(zone, NR_FREE_PAGES);
+        }
+        return result;
+}
+
+/*
+ * Discover how much extra memory will be required by the drivers
+ * when they're asked to hibernate. We can then ensure that amount
+ * of memory is available when we really want it.
+ */
+static void get_extra_pd1_allowance(void)
+{
+        unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final;
+
+        toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
+
+        if (toi_go_atomic(PMSG_FREEZE, 1))
+                return;
+
+        final = real_nr_free_pages(all_zones_mask);
+        toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0);
+
+        extra_pd1_pages_allowance = (orig_num_free > final) ?
+                orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE :
+                MIN_EXTRA_PAGES_ALLOWANCE;
+}
+
+/*
+ * Amount of storage needed, possibly taking into account the
+ * expected compression ratio and possibly also ignoring our
+ * allowance for extra pages.
+ */
+static unsigned long main_storage_needed(int use_ecr,
+                int ignore_extra_pd1_allow)
+{
+        return (pagedir1.size + pagedir2.size +
+          (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
+         (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
+}
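+
+/*
+ * Illustrative example (figures invented): with pagedir1.size = 100000 pages,
+ * pagedir2.size = 50000 pages, an extra allowance of 500 pages and an
+ * expected compression ratio of 50%, main_storage_needed(1, 0) is
+ * (100000 + 50000 + 500) * 50 / 100 = 75250 pages.
+ */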
+
+/*
+ * Storage needed for the image header. The total is computed in bytes and
+ * converted to pages for the return value.
+ *
+ * fs_info_space_needed is saved in a static variable unless we
+ * explicitly want to reset the value (done at the start of a cycle)
+ * as it requires memory allocation that may result in a hang if we're
+ * also trying to free memory.
+ */
+unsigned long get_header_storage_needed(int reset)
+{
+        unsigned long bytes = sizeof(struct toi_header) +
+                        toi_header_storage_for_modules() +
+                        toi_pageflags_space_needed() +
+                        fs_info_space_needed(0);
+
+        return DIV_ROUND_UP(bytes, PAGE_SIZE);
+}
+
+/*
+ * When freeing memory, pages from either pageset might be freed.
+ *
+ * When seeking to free memory to be able to hibernate, for every ps1 page
+ * freed, we need 2 less pages for the atomic copy because there is one less
+ * page to copy and one more page into which data can be copied.
+ *
+ * Freeing ps2 pages saves us nothing directly. No more memory is available
+ * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
+ * that's too much work to figure out.
+ *
+ * => ps1_to_free functions
+ *
+ * Of course if we just want to reduce the image size, because of storage
+ * limitations or an image size limit either ps will do.
+ *
+ * => any_to_free function
+ */
+
+static unsigned long lowpages_usable_for_highmem_copy(void)
+{
+        unsigned long needed = get_lowmem_size(pagedir1) +
+                        extra_pd1_pages_allowance + MIN_FREE_RAM +
+                        toi_memory_for_modules(0),
+                available = get_lowmem_size(pagedir2) +
+                         real_nr_free_low_pages() + extra_pages_allocated;
+
+        return available > needed ? available - needed : 0;
+}
+
+static unsigned long highpages_ps1_to_free(void)
+{
+        unsigned long need = get_highmem_size(pagedir1),
+                      available = get_highmem_size(pagedir2) +
+                              real_nr_free_high_pages() +
+                              lowpages_usable_for_highmem_copy();
+
+        return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
+}
+
+static unsigned long lowpages_ps1_to_free(void)
+{
+        unsigned long needed = get_lowmem_size(pagedir1) +
+                        extra_pd1_pages_allowance + MIN_FREE_RAM +
+                        toi_memory_for_modules(0),
+                available = get_lowmem_size(pagedir2) +
+                         real_nr_free_low_pages() + extra_pages_allocated;
+
+        return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0;
+}
+
+static unsigned long current_image_size(void)
+{
+        return pagedir1.size + pagedir2.size + header_storage_needed;
+}
+
+static unsigned long storage_still_required(void)
+{
+        unsigned long needed = main_storage_needed(1, 1);
+        return needed > storage_limit ? needed - storage_limit : 0;
+}
+
+static unsigned long ram_still_required(void)
+{
+        unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) +
+                2 * extra_pd1_pages_allowance,
+                  available = real_nr_free_low_pages() + extra_pages_allocated;
+        return needed > available ? needed - available : 0;
+}
+
+unsigned long any_to_free(int use_image_size_limit)
+{
+        int use_soft_limit = use_image_size_limit && image_size_limit > 0;
+        unsigned long current_size = current_image_size(),
+                      soft_limit = use_soft_limit ? (image_size_limit << 8) : 0,
+                      to_free = use_soft_limit ? (current_size > soft_limit ?
+                                      current_size - soft_limit : 0) : 0,
+                      storage_shortfall = storage_still_required(),
+                      ram_shortfall = ram_still_required(),
+                      first_max = max(to_free, storage_shortfall);
+
+        return max(first_max, ram_shortfall);
+}
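+
+/*
+ * Notes on the calculation above: image_size_limit is interpreted in
+ * megabytes, so the << 8 converts it to a page count (256 pages per MB,
+ * assuming 4 KiB pages); negative values are special-cased in eat_memory().
+ * The result is the largest of the three shortfalls: the excess over the
+ * soft image-size limit, the storage still required and the low memory
+ * still required.
+ */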
+
+static int need_pageset2(void)
+{
+        return (real_nr_free_low_pages() + extra_pages_allocated -
+                2 * extra_pd1_pages_allowance - MIN_FREE_RAM -
+                 toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size;
+}
+
+/* amount_needed
+ *
+ * Calculates the amount by which the image size needs to be reduced to meet
+ * our constraints.
+ */
+static unsigned long amount_needed(int use_image_size_limit)
+{
+        return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
+                        any_to_free(use_image_size_limit));
+}
+
+static int image_not_ready(int use_image_size_limit)
+{
+        toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
+                "Amount still needed (%lu) > 0:%u,"
+                " Storage allocd: %lu < %lu: %u.\n",
+                        amount_needed(use_image_size_limit),
+                        (amount_needed(use_image_size_limit) > 0),
+                        main_storage_allocated,
+                        main_storage_needed(1, 1),
+                        main_storage_allocated < main_storage_needed(1, 1));
+
+        toi_cond_pause(0, NULL);
+
+        return (amount_needed(use_image_size_limit) > 0) ||
+                 main_storage_allocated < main_storage_needed(1, 1);
+}
+
+static void display_failure_reason(int tries_exceeded)
+{
+        unsigned long storage_required = storage_still_required(),
+            ram_required = ram_still_required(),
+            high_ps1 = highpages_ps1_to_free(),
+            low_ps1 = lowpages_ps1_to_free();
+
+        printk(KERN_INFO "Failed to prepare the image because...\n");
+
+        if (!storage_limit) {
+                printk(KERN_INFO "- You need some storage available to be "
+                                "able to hibernate.\n");
+                return;
+        }
+
+        if (tries_exceeded)
+                printk(KERN_INFO "- The maximum number of iterations was "
+                                "reached without successfully preparing the "
+                                "image.\n");
+
+        if (storage_required) {
+                printk(KERN_INFO " - We need at least %lu pages of storage "
+                                "(ignoring the header), but only have %lu.\n",
+                                main_storage_needed(1, 1),
+                                main_storage_allocated);
+                set_abort_result(TOI_INSUFFICIENT_STORAGE);
+        }
+
+        if (ram_required) {
+                printk(KERN_INFO " - We need %lu more free pages of low "
+                                "memory.\n", ram_required);
+                printk(KERN_INFO "     Minimum free     : %8d\n", MIN_FREE_RAM);
+                printk(KERN_INFO "   + Reqd. by modules : %8lu\n",
+                                toi_memory_for_modules(0));
+                printk(KERN_INFO "   + 2 * extra allow  : %8lu\n",
+                                2 * extra_pd1_pages_allowance);
+                printk(KERN_INFO "   - Currently free   : %8lu\n",
+                                real_nr_free_low_pages());
+                printk(KERN_INFO "   - Pages allocd     : %8lu\n",
+                                extra_pages_allocated);
+                printk(KERN_INFO "                      : ========\n");
+                printk(KERN_INFO "     Still needed     : %8lu\n",
+                                ram_required);
+
+                /* Print breakdown of memory needed for modules */
+                toi_memory_for_modules(1);
+                set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+        }
+
+        if (high_ps1) {
+                printk(KERN_INFO "- We need to free %lu highmem pageset 1 "
+                                "pages.\n", high_ps1);
+                set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+        }
+
+        if (low_ps1) {
+                printk(KERN_INFO "- We need to free %lu lowmem pageset 1 "
+                                "pages.\n", low_ps1);
+                set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+        }
+}
+
+static void display_stats(int always, int sub_extra_pd1_allow)
+{
+        char buffer[255];
+        snprintf(buffer, 254,
+                "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). "
+                "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). "
+                "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n",
+
+                /* Free */
+                real_nr_free_pages(all_zones_mask),
+                real_nr_free_low_pages(),
+
+                /* Sets */
+                pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
+                pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
+
+                /* Nosave */
+                num_nosave, extra_pages_allocated,
+                num_nosave - extra_pages_allocated,
+
+                /* Storage */
+                main_storage_allocated,
+                storage_limit,
+                main_storage_needed(1, sub_extra_pd1_allow),
+                main_storage_needed(1, 1),
+
+                /* Needed */
+                lowpages_ps1_to_free(), highpages_ps1_to_free(),
+                any_to_free(1),
+                MIN_FREE_RAM, toi_memory_for_modules(0),
+                extra_pd1_pages_allowance,
+                image_size_limit,
+
+                need_pageset2() ? "yes" : "no");
+
+        if (always)
+                printk("%s", buffer);
+        else
+                toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
+}
+
+/* flag_image_pages
+ *
+ * This routine generates our lists of pages to be stored in each
+ * pageset. Since we store the data using extents, and adding new
+ * extents might allocate a new extent page, this routine may well
+ * be called more than once.
+ */
+static void flag_image_pages(int atomic_copy)
+{
+        int num_free = 0, num_unmodified = 0;
+        unsigned long loop;
+        struct zone *zone;
+
+        pagedir1.size = 0;
+        pagedir2.size = 0;
+
+        set_highmem_size(pagedir1, 0);
+        set_highmem_size(pagedir2, 0);
+
+        num_nosave = 0;
+        toi_trace_index++;
+
+        memory_bm_clear(pageset1_map);
+
+        toi_generate_free_page_map();
+
+        /*
+         * Pages not to be saved are marked Nosave irrespective of being
+         * reserved.
+         */
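+        /*
+         * Classification below, in order: invalid pfns are skipped, runs of
+         * free pages are cleared from both pagesets, Nosave and unsaveable
+         * pages are counted as nosave, pages kept from an existing image
+         * are counted as unmodified, and everything else is assigned to
+         * pageset2 (if already flagged for it) or pageset1.
+         */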
+        for_each_populated_zone(zone) {
+                int highmem = is_highmem(zone);
+
+                for (loop = 0; loop < zone->spanned_pages; loop++) {
+                        unsigned long pfn = zone->zone_start_pfn + loop;
+                        struct page *page;
+                        int chunk_size;
+
+                        if (!pfn_valid(pfn)) {
+                                TOI_TRACE_DEBUG(pfn, "_Flag Invalid");
+                                continue;
+                        }
+
+                        chunk_size = toi_size_of_free_region(zone, pfn);
+                        if (chunk_size) {
+                                unsigned long y;
+
+                                for (y = pfn; y < pfn + chunk_size; y++) {
+                                        page = pfn_to_page(y);
+                                        TOI_TRACE_DEBUG(y, "_Flag Free");
+                                        ClearPagePageset1(page);
+                                        ClearPagePageset2(page);
+                                }
+                                num_free += chunk_size;
+                                loop += chunk_size - 1;
+                                continue;
+                        }
+
+                        page = pfn_to_page(pfn);
+
+                        if (PageNosave(page)) {
+                                char *desc = PagePageset1Copy(page) ?
+                                        "Pageset1Copy" : "NoSave";
+
+                                TOI_TRACE_DEBUG(pfn, "_Flag %s", desc);
+                                num_nosave++;
+                                continue;
+                        }
+
+                        page = highmem ? saveable_highmem_page(zone, pfn) :
+                                saveable_page(zone, pfn);
+
+                        if (!page) {
+                                TOI_TRACE_DEBUG(pfn, "_Flag Nosave2");
+                                num_nosave++;
+                                continue;
+                        }
+
+                        if (PageTOI_RO(page) &&
+                            test_result_state(TOI_KEPT_IMAGE)) {
+                                TOI_TRACE_DEBUG(pfn, "_Unmodified");
+                                num_unmodified++;
+                                continue;
+                        }
+
+                        if (PagePageset2(page)) {
+                                pagedir2.size++;
+                                TOI_TRACE_DEBUG(pfn, "_Flag PS2");
+                                if (PageHighMem(page))
+                                        inc_highmem_size(pagedir2);
+                                else
+                                        SetPagePageset1Copy(page);
+                                if (PageResave(page)) {
+                                        SetPagePageset1(page);
+                                        ClearPagePageset1Copy(page);
+                                        pagedir1.size++;
+                                        if (PageHighMem(page))
+                                                inc_highmem_size(pagedir1);
+                                }
+                        } else {
+                                pagedir1.size++;
+                                TOI_TRACE_DEBUG(pfn, "_Flag PS1");
+                                SetPagePageset1(page);
+                                if (PageHighMem(page))
+                                        inc_highmem_size(pagedir1);
+                        }
+                }
+        }
+
+        if (!atomic_copy)
+                toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
+                        "Count data pages: Set1 (%lu) + Set2 (%lu) + Nosave (%lu)"
+                                    " + Unmodified (%d) + NumFree (%d) = %lu.\n",
+                        pagedir1.size, pagedir2.size, num_nosave, num_unmodified,
+                        num_free, pagedir1.size + pagedir2.size + num_nosave +
+                        num_unmodified + num_free);
+}
+
+void toi_recalculate_image_contents(int atomic_copy)
+{
+        memory_bm_clear(pageset1_map);
+        if (!atomic_copy) {
+                unsigned long pfn;
+                memory_bm_position_reset(pageset2_map);
+                for (pfn = memory_bm_next_pfn(pageset2_map, 0);
+                                pfn != BM_END_OF_MAP;
+                                pfn = memory_bm_next_pfn(pageset2_map, 0))
+                        ClearPagePageset1Copy(pfn_to_page(pfn));
+                /* Need to call this before getting pageset1_size! */
+                toi_mark_pages_for_pageset2();
+        }
+        memory_bm_position_reset(pageset2_map);
+        flag_image_pages(atomic_copy);
+
+        if (!atomic_copy) {
+                storage_limit = toiActiveAllocator->storage_available();
+                display_stats(0, 0);
+        }
+}
+
+int try_allocate_extra_memory(void)
+{
+        unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
+                get_lowmem_size(pagedir2);
+        if (wanted > extra_pages_allocated) {
+                unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
+                if (got < wanted) {
+                        toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
+                                "Want %lu extra pages for pageset1, got %lu.\n",
+                                wanted, got);
+                        return 1;
+                }
+        }
+        return 0;
+}
+
+/* update_image
+ *
+ * Allocate [more] memory and storage for the image.
+ */
+static void update_image(int ps2_recalc)
+{
+        int old_header_req;
+        unsigned long seek;
+
+        if (try_allocate_extra_memory())
+                return;
+
+        if (ps2_recalc)
+                goto recalc;
+
+        thaw_kernel_threads();
+
+        /*
+         * Allocate remaining storage space, if possible, up to the
+         * maximum we know we'll need. It's okay to allocate the
+         * maximum if the writer is the swapwriter, but
+         * we don't want to grab all available space on an NFS share.
+         * We therefore ignore the expected compression ratio here,
+         * thereby trying to allocate the maximum image size we could
+         * need (assuming compression doesn't expand the image), but
+         * don't complain if we can't get the full amount we're after.
+         */
+
+        do {
+                int result;
+
+                old_header_req = header_storage_needed;
+                toiActiveAllocator->reserve_header_space(header_storage_needed);
+
+                /* How much storage is free with the reservation applied? */
+                storage_limit = toiActiveAllocator->storage_available();
+                seek = min(storage_limit, main_storage_needed(0, 0));
+
+                result = toiActiveAllocator->allocate_storage(seek);
+                if (result)
+                        printk(KERN_ERR "Failed to allocate storage (%d).\n", result);
+
+                main_storage_allocated =
+                        toiActiveAllocator->storage_allocated();
+
+                /* Need more header because more storage allocated? */
+                header_storage_needed = get_header_storage_needed(0);
+
+        } while (header_storage_needed > old_header_req);
+
+        if (freeze_kernel_threads())
+                set_abort_result(TOI_FREEZING_FAILED);
+
+recalc:
+        toi_recalculate_image_contents(0);
+}
+
+/* attempt_to_freeze
+ *
+ * Try to freeze processes.
+ */
+
+static int attempt_to_freeze(void)
+{
+        int result;
+
+        /* Stop processes before checking again */
+        toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
+                        "filesystems.");
+        result = freeze_processes();
+
+        if (result)
+                set_abort_result(TOI_FREEZING_FAILED);
+
+        result = freeze_kernel_threads();
+
+        if (result)
+                set_abort_result(TOI_FREEZING_FAILED);
+
+        return result;
+}
+
+/* eat_memory
+ *
+ * Try to free some memory, either to meet hard or soft constraints on the
+ * image characteristics.
+ *
+ * Hard constraints:
+ * - Pageset1 must be < half of memory;
+ * - We must have enough memory free at resume time to be able to load
+ *   pageset1 into pages that don't conflict with where it has to be
+ *   restored.
+ * Soft constraints:
+ * - User specified image size limit.
+ */
+static void eat_memory(void)
+{
+        unsigned long amount_wanted = 0;
+        int did_eat_memory = 0;
+
+        /*
+         * Note that if we have enough storage space and enough free memory, we
+         * may exit without eating anything. We give up when the last 10
+         * iterations ate no extra pages because we're not going to get much
+         * more anyway, but the few pages we get will take a lot of time.
+         *
+         * We freeze processes before beginning, and then unfreeze them if we
+         * need to eat memory until we think we have enough. If our attempts
+         * to freeze fail, we give up and abort.
+         */
+
+        amount_wanted = amount_needed(1);
+
+        switch (image_size_limit) {
+        case -1: /* Don't eat any memory */
+                if (amount_wanted > 0) {
+                        set_abort_result(TOI_WOULD_EAT_MEMORY);
+                        return;
+                }
+                break;
+        case -2:  /* Free caches only */
+                drop_pagecache();
+                toi_recalculate_image_contents(0);
+                amount_wanted = amount_needed(1);
+                break;
+        default:
+                break;
+        }
+
+        if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
+                        image_size_limit != -1) {
+                unsigned long request = amount_wanted;
+                unsigned long high_req = max(highpages_ps1_to_free(),
+                                any_to_free(1));
+                unsigned long low_req = lowpages_ps1_to_free();
+                unsigned long got = 0;
+
+                toi_prepare_status(CLEAR_BAR,
+                                "Seeking to free %luMB of memory.",
+                                MB(amount_wanted));
+
+                thaw_kernel_threads();
+
+                /*
+                 * Ask for too many because shrink_memory_mask doesn't
+                 * currently return enough most of the time.
+                 */
+
+                if (low_req)
+                        got = shrink_memory_mask(low_req, GFP_KERNEL);
+                if (high_req)
+                        shrink_memory_mask(high_req - got, GFP_HIGHUSER);
+
+                did_eat_memory = 1;
+
+                toi_recalculate_image_contents(0);
+
+                amount_wanted = amount_needed(1);
+
+                printk(KERN_DEBUG "Asked shrink_memory_mask for %lu low pages &"
+                                " %lu pages from anywhere, got %lu.\n",
+                                low_req, high_req,
+                                request - amount_wanted);
+
+                toi_cond_pause(0, NULL);
+
+                if (freeze_kernel_threads())
+                        set_abort_result(TOI_FREEZING_FAILED);
+        }
+
+        if (did_eat_memory)
+                toi_recalculate_image_contents(0);
+}
+
+/* toi_prepare_image
+ *
+ * Entry point to the whole image preparation section.
+ *
+ * We do four things:
+ * - Freeze processes;
+ * - Ensure image size constraints are met;
+ * - Complete all the preparation for saving the image,
+ *   including allocation of storage. The only memory
+ *   that should be needed when we're finished is that
+ *   for actually storing the image (and we know how
+ *   much is needed for that because the modules tell
+ *   us).
+ * - Make sure that all dirty buffers are written out.
+ */
+#define MAX_TRIES 2
+int toi_prepare_image(void)
+{
+        int result = 1, tries = 1;
+
+        main_storage_allocated = 0;
+
+        /*
+         * Force recalculation of the amount of header storage needed for
+         * fs info.
+         */
+        fs_info_space_needed(1);
+
+        no_ps2_needed = 0;
+
+        if (attempt_to_freeze())
+                return 1;
+
+        lock_device_hotplug();
+        set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
+
+        if (!extra_pd1_pages_allowance)
+                get_extra_pd1_allowance();
+
+        storage_limit = toiActiveAllocator->storage_available();
+
+        if (!storage_limit) {
+                printk(KERN_INFO "No storage available. Didn't try to prepare "
+                                "an image.\n");
+                display_failure_reason(0);
+                set_abort_result(TOI_NOSTORAGE_AVAILABLE);
+                return 1;
+        }
+
+        if (build_attention_list()) {
+                abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
+                                "Unable to successfully prepare the image.\n");
+                return 1;
+        }
+
+        toi_recalculate_image_contents(0);
+
+        do {
+                toi_prepare_status(CLEAR_BAR,
+                                "Preparing Image. Try %d.", tries);
+
+                eat_memory();
+
+                if (test_result_state(TOI_ABORTED))
+                        break;
+
+                update_image(0);
+
+                tries++;
+
+        } while (image_not_ready(1) && tries <= MAX_TRIES &&
+                        !test_result_state(TOI_ABORTED));
+
+        result = image_not_ready(0);
+
+        /* TODO: Handle case where need to remove existing image and resave
+         * instead of adding to incremental image. */
+
+        if (!test_result_state(TOI_ABORTED)) {
+                if (result) {
+                        display_stats(1, 0);
+                        display_failure_reason(tries > MAX_TRIES);
+                        abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
+                                "Unable to successfully prepare the image.\n");
+                } else {
+                        /* Pageset 2 needed? */
+                        if (!need_pageset2() &&
+                                  test_action_state(TOI_NO_PS2_IF_UNNEEDED)) {
+                                no_ps2_needed = 1;
+                                toi_recalculate_image_contents(0);
+                                update_image(1);
+                        }
+
+                        toi_cond_pause(1, "Image preparation complete.");
+                }
+        }
+
+        return result ? result : allocate_checksum_pages();
+}
diff --git b/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
new file mode 100644
index 0000000..f7f2b69
--- /dev/null
+++ b/kernel/power/tuxonice_prepare_image.h
@@ -0,0 +1,38 @@
+/*
+ * kernel/power/tuxonice_prepare_image.h
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <asm/sections.h>
+
+extern int toi_prepare_image(void);
+extern void toi_recalculate_image_contents(int atomic_copy);
+extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask);
+extern long image_size_limit;
+extern void toi_free_extra_pagedir_memory(void);
+extern unsigned long extra_pd1_pages_allowance;
+extern void free_attention_list(void);
+
+#define MIN_FREE_RAM 100
+#define MIN_EXTRA_PAGES_ALLOWANCE 500
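+/* Both values above are in pages, not bytes. */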
+
+#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
+#ifdef CONFIG_HIGHMEM
+#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
+#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
+                                                (1 << ZONE_HIGHMEM)))
+#else
+#define real_nr_free_high_pages() (0)
+#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
+
+/* For eat_memory function */
+#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
+#endif
+
+unsigned long get_header_storage_needed(int reset);
+unsigned long any_to_free(int use_image_size_limit);
+int try_allocate_extra_memory(void);
diff --git b/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c
new file mode 100644
index 0000000..5bc56d3
--- /dev/null
+++ b/kernel/power/tuxonice_prune.c
@@ -0,0 +1,406 @@
+/*
+ * kernel/power/tuxonice_prune.c
+ *
+ * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file implements a TuxOnIce module that seeks to prune the
+ * amount of data written to disk. It builds a table of hashes
+ * of the uncompressed data, and writes the pfn of the previous page
+ * with the same contents instead of repeating the data when a match
+ * is found.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+
+#include "tuxonice_builtin.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+
+/*
+ * We never write a page bigger than PAGE_SIZE, so use a large number
+ * to indicate that data is a PFN.
+ */
+#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
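+
+/*
+ * The reader tells the two cases apart by the stored length: a length of
+ * exactly PRUNE_DATA_IS_PFN marks the payload as a pfn reference rather
+ * than page contents (see toi_prune_read_page()).
+ */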
+
+static unsigned long toi_pruned_pages;
+
+static struct toi_module_ops toi_prune_ops;
+static struct toi_module_ops *next_driver;
+
+static char toi_prune_hash_algo_name[32] = "sha1";
+
+static DEFINE_MUTEX(stats_lock);
+
+struct cpu_context {
+        struct shash_desc desc;
+        char *digest;
+};
+
+#define OUT_BUF_SIZE (2 * PAGE_SIZE)
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+
+/*
+ * toi_crypto_prepare
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ */
+static int toi_prune_crypto_prepare(void)
+{
+        int cpu, ret, digestsize = 0;
+
+        if (!*toi_prune_hash_algo_name) {
+                printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
+                                "hash algorithm set.\n");
+                return 1;
+        }
+
+        for_each_online_cpu(cpu) {
+                struct cpu_context *this = &per_cpu(contexts, cpu);
+                this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
+                if (IS_ERR(this->desc.tfm)) {
+                        printk(KERN_INFO "TuxOnIce: Failed to allocate the "
+                                        "%s prune hash algorithm.\n",
+                                        toi_prune_hash_algo_name);
+                        this->desc.tfm = NULL;
+                        return 1;
+                }
+
+                if (!digestsize)
+                        digestsize = crypto_shash_digestsize(this->desc.tfm);
+
+                this->digest = kmalloc(digestsize, GFP_KERNEL);
+                if (!this->digest) {
+                        printk(KERN_INFO "TuxOnIce: Failed to allocate space "
+                                        "for digest output.\n");
+                        crypto_free_shash(this->desc.tfm);
+                        this->desc.tfm = NULL;
+                        return 1;
+                }
+
+                this->desc.flags = 0;
+
+                ret = crypto_shash_init(&this->desc);
+                if (ret < 0) {
+                        printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+                                        "%s prune hash algorithm.\n",
+                                        toi_prune_hash_algo_name);
+                        kfree(this->digest);
+                        this->digest = NULL;
+                        crypto_free_shash(this->desc.tfm);
+                        this->desc.tfm = NULL;
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+static int toi_prune_rw_cleanup(int writing)
+{
+        int cpu;
+
+        for_each_online_cpu(cpu) {
+                struct cpu_context *this = &per_cpu(contexts, cpu);
+                if (this->desc.tfm) {
+                        crypto_free_shash(this->desc.tfm);
+                        this->desc.tfm = NULL;
+                }
+
+                if (this->digest) {
+                        kfree(this->digest);
+                        this->digest = NULL;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * toi_prune_init
+ */
+
+static int toi_prune_init(int toi_or_resume)
+{
+        if (!toi_or_resume)
+                return 0;
+
+        toi_pruned_pages = 0;
+
+        next_driver = toi_get_next_filter(&toi_prune_ops);
+
+        return next_driver ? 0 : -ECHILD;
+}
+
+/*
+ * toi_prune_rw_init()
+ */
+
+static int toi_prune_rw_init(int rw, int stream_number)
+{
+        if (toi_prune_crypto_prepare()) {
+                printk(KERN_ERR "Failed to initialise prune "
+                                "algorithm.\n");
+                if (rw == READ) {
+                        printk(KERN_INFO "Unable to read the image.\n");
+                        return -ENODEV;
+                } else {
+                        printk(KERN_INFO "Continuing without "
+                                "pruning the image.\n");
+                        toi_prune_ops.enabled = 0;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * toi_prune_write_page()
+ *
+ * Hash a page of data, update the pruning statistics and pass the page
+ * on to the next module in the pipeline.
+ *
+ * Buffer_page:        Pointer to a buffer of size PAGE_SIZE, containing
+ * data to be checked.
+ *
+ * Returns:        0 on success, otherwise the error returned by the later
+ *                 modules.
+ */
+static int toi_prune_write_page(unsigned long index, int buf_type,
+                void *buffer_page, unsigned int buf_size)
+{
+        int ret = 0, cpu = smp_processor_id();
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+        u8 *output_buffer = buffer_page;
+        int output_len = buf_size;
+        int out_buf_type = buf_type;
+        void *buffer_start;
+
+        if (ctx->desc.tfm) {
+                buffer_start = TOI_MAP(buf_type, buffer_page);
+
+                ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size,
+                                (u8 *) ctx->digest);
+                if (ret) {
+                        printk(KERN_INFO "TuxOnIce: Failed to calculate "
+                                        "digest (%d).\n", ret);
+                } else {
+                        mutex_lock(&stats_lock);
+                        toi_pruned_pages++;
+                        mutex_unlock(&stats_lock);
+                }
+
+                TOI_UNMAP(buf_type, buffer_page);
+        }
+
+        return next_driver->write_page(index, out_buf_type, output_buffer,
+                        output_len);
+}
+
+/*
+ * toi_prune_read_page()
+ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Retrieve data from later modules or from a previously loaded page and
+ * fill the input buffer.
+ * Zero if successful. Error condition from me or from downstream on failure.
+ */
+static int toi_prune_read_page(unsigned long *index, int buf_type,
+                void *buffer_page, unsigned int *buf_size)
+{
+        int ret, cpu = smp_processor_id();
+        unsigned int len;
+        char *buffer_start;
+        struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+        if (!ctx->desc.tfm)
+                return next_driver->read_page(index, TOI_PAGE, buffer_page,
+                                buf_size);
+
+        /*
+         * All our reads must be synchronous - we can't handle
+         * data that hasn't been read yet.
+         */
+
+        ret = next_driver->read_page(index, buf_type, buffer_page, &len);
+
+        if (!ret && len == PRUNE_DATA_IS_PFN) {
+                /* Payload is a pfn reference to an identical page. */
+                buffer_start = kmap(buffer_page);
+                kunmap(buffer_page);
+        }
+
+        if (!ret)
+                *buf_size = len;
+
+        return ret;
+}
+
+/*
+ * toi_prune_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_prune_print_debug_stats(char *buffer, int size)
+{
+        int len;
+
+        /* Report the hash algorithm in use and the number of pages pruned. */
+        if (*toi_prune_hash_algo_name)
+                len = scnprintf(buffer, size,
+                                "- Prune hash algorithm is '%s'.\n",
+                                toi_prune_hash_algo_name);
+        else
+                len = scnprintf(buffer, size,
+                                "- Prune hash algorithm is not set.\n");
+
+        if (toi_pruned_pages)
+                len += scnprintf(buffer + len, size - len,
+                                "  Pruned %lu pages.\n", toi_pruned_pages);
+
+        return len;
+}
+
+/*
+ * toi_prune_memory_needed
+ *
+ * Tell the caller how much memory we need to operate during hibernate/resume.
+ * Returns: Maximum number of bytes of memory required for
+ * operation.
+ */
+static int toi_prune_memory_needed(void)
+{
+        return 2 * PAGE_SIZE;
+}
+
+static int toi_prune_storage_needed(void)
+{
+        return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
+                strlen(toi_prune_hash_algo_name) + 1;
+}
+
+/*
+ * toi_prune_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save information needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_prune_save_config_info(char *buffer)
+{
+        int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0;
+
+        *((unsigned long *) buffer) = toi_pruned_pages;
+        offset += sizeof(unsigned long);
+        *((int *) (buffer + offset)) = len;
+        offset += sizeof(int);
+        strncpy(buffer + offset, toi_prune_hash_algo_name, len);
+        return offset + len;
+}
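+
+/*
+ * Layout written above: an unsigned long holding the pruned page count,
+ * an int holding the length of the algorithm name (including the trailing
+ * NUL), then the name itself. toi_prune_load_config_info() below reads the
+ * fields back in the same order.
+ */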
+
+/* toi_prune_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description:        Reload information needed for passing back to the
+ * resumed kernel.
+ */
+static void toi_prune_load_config_info(char *buffer, int size)
+{
+        int len, offset = 0;
+
+        toi_pruned_pages = *((unsigned long *) buffer);
+        offset += sizeof(unsigned long);
+        len = *((int *) (buffer + offset));
+        offset += sizeof(int);
+        strncpy(toi_prune_hash_algo_name, buffer + offset, len);
+}
+
+static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+        bkd->pruned_pages = toi_pruned_pages;
+}
+
+static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+        toi_pruned_pages = bkd->pruned_pages;
+}
+
+/*
+ * toi_expected_ratio
+ *
+ * Description:        Returns the expected ratio between data passed into this module
+ *                 and the amount of data output when writing.
+ * Returns:        100 - we have no idea how many pages will be pruned.
+ */
+
+static int toi_prune_expected_ratio(void)
+{
+        return 100;
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0,
+                        NULL),
+        SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL),
+};
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_prune_ops = {
+        .type                        = FILTER_MODULE,
+        .name                        = "prune",
+        .directory                = "prune",
+        .module                        = THIS_MODULE,
+        .initialise                = toi_prune_init,
+        .memory_needed                 = toi_prune_memory_needed,
+        .print_debug_info        = toi_prune_print_debug_stats,
+        .save_config_info        = toi_prune_save_config_info,
+        .load_config_info        = toi_prune_load_config_info,
+        .storage_needed                = toi_prune_storage_needed,
+        .expected_compression        = toi_prune_expected_ratio,
+
+        .pre_atomic_restore        = toi_prune_pre_atomic_restore,
+        .post_atomic_restore        = toi_prune_post_atomic_restore,
+
+        .rw_init                = toi_prune_rw_init,
+        .rw_cleanup                = toi_prune_rw_cleanup,
+
+        .write_page                = toi_prune_write_page,
+        .read_page                = toi_prune_read_page,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+static __init int toi_prune_load(void)
+{
+        return toi_register_module(&toi_prune_ops);
+}
+
+late_initcall(toi_prune_load);
diff --git b/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
new file mode 100644
index 0000000..d8539c2
--- /dev/null
+++ b/kernel/power/tuxonice_storage.c
@@ -0,0 +1,282 @@
+/*
+ * kernel/power/tuxonice_storage.c
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for talking to a userspace program that manages storage.
+ *
+ * The kernel side:
+ * - starts the userspace program;
+ * - sends messages telling it when to open and close the connection;
+ * - tells it when to quit;
+ *
+ * The user space side:
+ * - passes messages regarding status;
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_ui.h"
+
+static struct user_helper_data usm_helper_data;
+static struct toi_module_ops usm_ops;
+static int message_received, usm_prepare_count;
+static int storage_manager_last_action, storage_manager_action;
+
+static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+        int type;
+        int *data;
+
+        type = nlh->nlmsg_type;
+
+        /* A control message: ignore it */
+        if (type < NETLINK_MSG_BASE)
+                return 0;
+
+        /* Unknown message: reply with EINVAL */
+        if (type >= USM_MSG_MAX)
+                return -EINVAL;
+
+        /* All operations require privileges, even GET */
+        if (!capable(CAP_NET_ADMIN))
+                return -EPERM;
+
+        /* Only allow one task to receive NOFREEZE privileges */
+        if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
+                return -EBUSY;
+
+        data = (int *) NLMSG_DATA(nlh);
+
+        switch (type) {
+        case USM_MSG_SUCCESS:
+        case USM_MSG_FAILED:
+                message_received = type;
+                complete(&usm_helper_data.wait_for_process);
+                break;
+        default:
+                printk(KERN_INFO "Storage manager doesn't recognise "
+                                "message %d.\n", type);
+        }
+
+        return 1;
+}
+
+#ifdef CONFIG_NET
+static int activations;
+
+int toi_activate_storage(int force)
+{
+        int tries = 1;
+
+        if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+                return 0;
+
+        message_received = 0;
+        activations++;
+
+        if (activations > 1 && !force)
+                return 0;
+
+        while ((!message_received || message_received == USM_MSG_FAILED) &&
+                        tries < 2) {
+                toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
+                                "%d.\n", tries);
+
+                init_completion(&usm_helper_data.wait_for_process);
+
+                toi_send_netlink_message(&usm_helper_data,
+                        USM_MSG_CONNECT,
+                        NULL, 0);
+
+                /* Wait 2 seconds for the userspace process to make contact */
+                wait_for_completion_timeout(&usm_helper_data.wait_for_process,
+                                2*HZ);
+
+                tries++;
+        }
+
+        return 0;
+}
+
+int toi_deactivate_storage(int force)
+{
+        if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+                return 0;
+
+        message_received = 0;
+        activations--;
+
+        if (activations && !force)
+                return 0;
+
+        init_completion(&usm_helper_data.wait_for_process);
+
+        toi_send_netlink_message(&usm_helper_data,
+                        USM_MSG_DISCONNECT,
+                        NULL, 0);
+
+        wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
+
+        if (!message_received || message_received == USM_MSG_FAILED) {
+                printk(KERN_INFO "Returning failure disconnecting storage.\n");
+                return 1;
+        }
+
+        return 0;
+}
+#endif
+
+static void storage_manager_simulate(void)
+{
+        printk(KERN_INFO "--- Storage manager simulate ---\n");
+        toi_prepare_usm();
+        schedule();
+        printk(KERN_INFO "--- Activate storage 1 ---\n");
+        toi_activate_storage(1);
+        schedule();
+        printk(KERN_INFO "--- Deactivate storage 1 ---\n");
+        toi_deactivate_storage(1);
+        schedule();
+        printk(KERN_INFO "--- Cleanup usm ---\n");
+        toi_cleanup_usm();
+        schedule();
+        printk(KERN_INFO "--- Storage manager simulate ends ---\n");
+}
+
+static int usm_storage_needed(void)
+{
+        return sizeof(int) + strlen(usm_helper_data.program) + 1;
+}
+
+static int usm_save_config_info(char *buf)
+{
+        int len = strlen(usm_helper_data.program);
+
+        *((int *) buf) = len + 1;
+        memcpy(buf + sizeof(int), usm_helper_data.program, len + 1);
+        return sizeof(int) + len + 1;
+}
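+
+/*
+ * Layout written above: an int holding the length of the program path
+ * (including the trailing NUL) followed by the path itself, which is what
+ * usm_load_config_info() below expects to read back.
+ */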
+
+static void usm_load_config_info(char *buf, int size)
+{
+        /* Don't load the saved path if one has already been set */
+        if (usm_helper_data.program[0])
+                return;
+
+        memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf));
+}
+
+static int usm_memory_needed(void)
+{
+        /* ball park figure of 32 pages */
+        return 32 * PAGE_SIZE;
+}
+
+/* toi_prepare_usm
+ */
+int toi_prepare_usm(void)
+{
+        usm_prepare_count++;
+
+        if (usm_prepare_count > 1 || !usm_ops.enabled)
+                return 0;
+
+        usm_helper_data.pid = -1;
+
+        if (!*usm_helper_data.program)
+                return 0;
+
+        toi_netlink_setup(&usm_helper_data);
+
+        if (usm_helper_data.pid == -1)
+                printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
+                                " start it.\n");
+
+        toi_activate_storage(0);
+
+        return usm_helper_data.pid != -1;
+}
+
+void toi_cleanup_usm(void)
+{
+        usm_prepare_count--;
+
+        if (usm_helper_data.pid > -1 && !usm_prepare_count) {
+                toi_deactivate_storage(0);
+                toi_netlink_close(&usm_helper_data);
+        }
+}
+
+static void storage_manager_activate(void)
+{
+        if (storage_manager_action == storage_manager_last_action)
+                return;
+
+        if (storage_manager_action)
+                toi_prepare_usm();
+        else
+                toi_cleanup_usm();
+
+        storage_manager_last_action = storage_manager_action;
+}
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
+        SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
+        SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
+                NULL),
+        SYSFS_INT("activate_storage", SYSFS_RW, &storage_manager_action, 0, 1,
+                        0, storage_manager_activate)
+};
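+
+/*
+ * Typical usage from userspace (illustrative only; the helper path below is
+ * hypothetical): write the storage manager's path to the "program" entry,
+ * e.g.
+ *
+ *   echo /usr/local/sbin/toi-storage-manager > \
+ *       /sys/power/tuxonice/storage_manager/program
+ *
+ * then set "enabled" to 1. Writing to "activate_storage" toggles
+ * toi_prepare_usm()/toi_cleanup_usm() via storage_manager_activate().
+ */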
+
+static struct toi_module_ops usm_ops = {
+        .type                                = MISC_MODULE,
+        .name                                = "usm",
+        .directory                        = "storage_manager",
+        .module                                = THIS_MODULE,
+        .storage_needed                        = usm_storage_needed,
+        .save_config_info                = usm_save_config_info,
+        .load_config_info                = usm_load_config_info,
+        .memory_needed                        = usm_memory_needed,
+
+        .sysfs_data                        = sysfs_params,
+        .num_sysfs_entries                = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* toi_usm_init
+ * Description: Boot time initialisation for the userspace storage manager.
+ */
+int toi_usm_init(void)
+{
+        usm_helper_data.nl = NULL;
+        usm_helper_data.program[0] = '\0';
+        usm_helper_data.pid = -1;
+        usm_helper_data.skb_size = 0;
+        usm_helper_data.pool_limit = 6;
+        usm_helper_data.netlink_id = NETLINK_TOI_USM;
+        usm_helper_data.name = "userspace storage manager";
+        usm_helper_data.rcv_msg = usm_user_rcv_msg;
+        usm_helper_data.interface_version = 2;
+        usm_helper_data.must_init = 0;
+        init_completion(&usm_helper_data.wait_for_process);
+
+        return toi_register_module(&usm_ops);
+}
+
+void toi_usm_exit(void)
+{
+        toi_netlink_close_complete(&usm_helper_data);
+        toi_unregister_module(&usm_ops);
+}
diff --git b/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
new file mode 100644
index 0000000..0189c88
--- /dev/null
+++ b/kernel/power/tuxonice_storage.h
@@ -0,0 +1,45 @@
+/*
+ * kernel/power/tuxonice_storage.h
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifdef CONFIG_NET
+int toi_prepare_usm(void);
+void toi_cleanup_usm(void);
+
+int toi_activate_storage(int force);
+int toi_deactivate_storage(int force);
+extern int toi_usm_init(void);
+extern void toi_usm_exit(void);
+#else
+static inline int toi_usm_init(void) { return 0; }
+static inline void toi_usm_exit(void) { }
+
+static inline int toi_activate_storage(int force)
+{
+        return 0;
+}
+
+static inline int toi_deactivate_storage(int force)
+{
+        return 0;
+}
+
+static inline int toi_prepare_usm(void) { return 0; }
+static inline void toi_cleanup_usm(void) { }
+#endif
+
+enum {
+        USM_MSG_BASE = 0x10,
+
+        /* Kernel -> Userspace */
+        USM_MSG_CONNECT = 0x30,
+        USM_MSG_DISCONNECT = 0x31,
+        USM_MSG_SUCCESS = 0x40,
+        USM_MSG_FAILED = 0x41,
+
+        USM_MSG_MAX,
+};
diff --git b/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
new file mode 100644
index 0000000..9f555c9
--- /dev/null
+++ b/kernel/power/tuxonice_swap.c
@@ -0,0 +1,474 @@
+/*
+ * kernel/power/tuxonice_swap.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of swap space as a
+ * backing store.
+ */
+
+#include <linux/suspend.h>
+#include <linux/blkdev.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct toi_module_ops toi_swapops;
+
+/* For swapfile automatically swapon/off'd. */
+static char swapfilename[255] = "";
+static int toi_swapon_status;
+
+/* Swap Pages */
+static unsigned long swap_allocated;
+
+static struct sysinfo swapinfo;
+
+static int is_ram_backed(struct swap_info_struct *si)
+{
+        if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) ||
+            !strncmp(si->bdev->bd_disk->disk_name, "zram", 4))
+                return 1;
+
+        return 0;
+}
+
+/**
+ * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
+ *
+ * Activate the given swapfile if it wasn't already enabled. Remember whether
+ * we really did swapon it for swapoffing later.
+ */
+static void enable_swapfile(void)
+{
+        int activateswapresult = -EINVAL;
+
+        if (swapfilename[0]) {
+                /* Attempt to swap on with maximum priority */
+                activateswapresult = sys_swapon(swapfilename, 0xFFFF);
+                if (activateswapresult && activateswapresult != -EBUSY)
+                        printk(KERN_ERR "TuxOnIce: The swapfile/partition "
+                                "specified by /sys/power/tuxonice/swap/swapfile"
+                                " (%s) could not be turned on (error %d). "
+                                "Attempting to continue.\n",
+                                swapfilename, activateswapresult);
+                if (!activateswapresult)
+                        toi_swapon_status = 1;
+        }
+}
+
+/**
+ * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
+ *
+ * If we did successfully swapon a file at the start of the cycle, swapoff
+ * it now (finishing up).
+ */
+static void disable_swapfile(void)
+{
+        if (!toi_swapon_status)
+                return;
+
+        sys_swapoff(swapfilename);
+        toi_swapon_status = 0;
+}
+
+static int add_blocks_to_extent_chain(struct toi_bdev_info *chain,
+                unsigned long start, unsigned long end)
+{
+        if (test_action_state(TOI_TEST_BIO))
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to "
+                                "chain %p.", start << chain->bmap_shift,
+                                end << chain->bmap_shift, chain);
+
+        return toi_add_to_extent_chain(&chain->blocks, start, end);
+}
+
+
+static int get_main_pool_phys_params(struct toi_bdev_info *chain)
+{
+        struct hibernate_extent *extentpointer = NULL;
+        unsigned long address, extent_min = 0, extent_max = 0;
+        int empty = 1;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for "
+                        "chain %d.", chain->allocator_index);
+
+        if (!chain->allocations.first)
+                return 0;
+
+        if (chain->blocks.first)
+                toi_put_extent_chain(&chain->blocks);
+
+        toi_extent_for_each(&chain->allocations, extentpointer, address) {
+                swp_entry_t swap_address = (swp_entry_t) { address };
+                struct block_device *bdev;
+                sector_t new_sector = map_swap_entry(swap_address, &bdev);
+
+                if (empty) {
+                        empty = 0;
+                        extent_min = extent_max = new_sector;
+                        continue;
+                }
+
+                if (new_sector == extent_max + 1) {
+                        extent_max++;
+                        continue;
+                }
+
+                if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
+                        printk(KERN_ERR "Out of memory while making block "
+                                        "chains.\n");
+                        return -ENOMEM;
+                }
+
+                extent_min = new_sector;
+                extent_max = new_sector;
+        }
+
+        if (!empty &&
+            add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
+                printk(KERN_ERR "Out of memory while making block chains.\n");
+                return -ENOMEM;
+        }
+
+        return 0;
+}
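+
+/*
+ * Illustrative example: if the allocated swap entries map to sectors
+ * 100..103 and 200..201, the loop above records two extents, 100-103 and
+ * 200-201, rather than six single-sector entries.
+ */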
+
+/*
+ * Like si_swapinfo, except that we don't include ram backed swap (compcache!)
+ * and don't need to use the spinlocks (userspace is stopped when this
+ * function is called).
+ */
+void si_swapinfo_no_compcache(void)
+{
+        unsigned int i;
+
+        si_swapinfo(&swapinfo);
+        swapinfo.freeswap = 0;
+        swapinfo.totalswap = 0;
+
+        for (i = 0; i < MAX_SWAPFILES; i++) {
+                struct swap_info_struct *si = get_swap_info_struct(i);
+                if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) {
+                        swapinfo.totalswap += si->inuse_pages;
+                        swapinfo.freeswap += si->pages - si->inuse_pages;
+                }
+        }
+}
+
+/*
+ * We can't just remember the value from allocation time, because other
+ * processes might have allocated swap in the mean time.
+ */
+static unsigned long toi_swap_storage_available(void)
+{
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available.");
+        si_swapinfo_no_compcache();
+        return swapinfo.freeswap + swap_allocated;
+}
+
+static int toi_swap_initialise(int starting_cycle)
+{
+        if (!starting_cycle)
+                return 0;
+
+        enable_swapfile();
+        return 0;
+}
+
+static void toi_swap_cleanup(int ending_cycle)
+{
+        if (!ending_cycle)
+                return;
+
+        disable_swapfile();
+}
+
+static void toi_swap_free_storage(struct toi_bdev_info *chain)
+{
+        /* Free swap entries */
+        struct hibernate_extent *extentpointer;
+        unsigned long extentvalue;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.",
+                        chain);
+
+        swap_allocated -= chain->allocations.size;
+        toi_extent_for_each(&chain->allocations, extentpointer, extentvalue)
+                swap_free((swp_entry_t) { extentvalue });
+
+        toi_put_extent_chain(&chain->allocations);
+}
+
+static void free_swap_range(unsigned long min, unsigned long max)
+{
+        int j;
+
+        for (j = min; j <= max; j++)
+                swap_free((swp_entry_t) { j });
+        swap_allocated -= (max - min + 1);
+}
+
+/*
+ * Allocation of a single swap type. Swap priorities are handled at the higher
+ * level.
+ */
+static int toi_swap_allocate_storage(struct toi_bdev_info *chain,
+                unsigned long request)
+{
+        unsigned long gotten = 0;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "  Swap allocate storage: Asked to"
+                        " allocate %lu pages from device %d.", request,
+                        chain->allocator_index);
+
+        while (gotten < request) {
+                swp_entry_t start, end;
+                if (0) {
+                    /* Broken at the moment for SSDs */
+                    get_swap_range_of_type(chain->allocator_index, &start, &end,
+                            request - gotten + 1);
+                } else {
+                    start = end = get_swap_page_of_type(chain->allocator_index);
+                }
+                if (start.val) {
+                        int added = end.val - start.val + 1;
+                        if (toi_add_to_extent_chain(&chain->allocations,
+                                                start.val, end.val)) {
+                                printk(KERN_INFO "Failed to allocate extent for "
+                                        "%lu-%lu.\n", start.val, end.val);
+                                free_swap_range(start.val, end.val);
+                                break;
+                        }
+                        gotten += added;
+                        swap_allocated += added;
+                } else
+                        break;
+        }
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "  Allocated %lu pages.", gotten);
+        return gotten;
+}
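+
+/*
+ * Note: with the ranged allocation path disabled above, storage is gathered
+ * one swap page at a time via get_swap_page_of_type(), and each page is
+ * added to the chain's allocations extent chain individually.
+ */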
+
+static int toi_swap_register_storage(void)
+{
+        int i, result = 0;
+
+        toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage.");
+        for (i = 0; i < MAX_SWAPFILES; i++) {
+                struct swap_info_struct *si = get_swap_info_struct(i);
+                struct toi_bdev_info *devinfo;
+                unsigned char *p;
+                unsigned char buf[256];
+                struct fs_info *fs_info;
+
+                if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si))
+                        continue;
+
+                devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info),
+                                GFP_ATOMIC);
+                if (!devinfo) {
+                        printk(KERN_ERR "Failed to allocate devinfo struct "
+                                        "for swap device %d.\n", i);
+                        return -ENOMEM;
+                }
+
+                devinfo->bdev = si->bdev;
+                devinfo->allocator = &toi_swapops;
+                devinfo->allocator_index = i;
+
+                fs_info = fs_info_from_block_dev(si->bdev);
+                if (fs_info && !IS_ERR(fs_info)) {
+                        memcpy(devinfo->uuid, &fs_info->uuid, 16);
+                        free_fs_info(fs_info);
+                } else
+                        result = (int) PTR_ERR(fs_info);
+
+                if (!fs_info)
+                        printk("fs_info from block dev returned %d.\n", result);
+                devinfo->dev_t = si->bdev->bd_dev;
+                devinfo->prio = si->prio;
+                devinfo->bmap_shift = 3;
+                devinfo->blocks_per_page = 1;
+
+                p = d_path(&si->swap_file->f_path, buf, sizeof(buf));
+                sprintf(devinfo->name, "swap on %s", p);
+
+                toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:"
+                                " Device %d (%lx), prio %d.", i,
+                                (unsigned long) devinfo->dev_t, devinfo->prio);
+                toi_bio_ops.register_storage(devinfo);
+        }
+
+        return 0;
+}
+
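+/*
+ * toi_swap_free_unused_storage
+ *
+ * Walk the chain's allocations, swap_free()ing every allocated entry beyond
+ * the first 'used' ones, and return the first value freed (0 if nothing was
+ * freed).
+ */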
+static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain,
+                unsigned long used)
+{
+        struct hibernate_extent *extentpointer = NULL;
+        unsigned long extentvalue;
+        unsigned long i = 0, first_freed = 0;
+
+        toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) {
+                i++;
+                if (i > used) {
+                        swap_free((swp_entry_t) { extentvalue });
+                        if (!first_freed)
+                                first_freed = extentvalue;
+                }
+        }
+
+        return first_freed;
+}
+
+/*
+ * toi_swap_memory_needed
+ *
+ * Description:
+ * Returns the number of bytes of RAM needed for this
+ * code to do its work. (Used when calculating whether
+ * we have enough memory to be able to hibernate & resume).
+ *
+ */
+static int toi_swap_memory_needed(void)
+{
+        return 1;
+}
+
+/*
+ * toi_swap_print_debug_stats
+ *
+ * Description:
+ * Report the swap allocator's status, including the amount of swap
+ * available for the image, in the given buffer.
+ */
+static int toi_swap_print_debug_stats(char *buffer, int size)
+{
+        int len = 0;
+
+        len = scnprintf(buffer, size, "- Swap Allocator enabled.\n");
+        if (swapfilename[0])
+                len += scnprintf(buffer+len, size-len,
+                        "  Attempting to automatically swapon: %s.\n",
+                        swapfilename);
+
+        si_swapinfo_no_compcache();
+
+        len += scnprintf(buffer+len, size-len,
+                        "  Swap available for image: %lu pages.\n",
+                        swapinfo.freeswap + swap_allocated);
+
+        return len;
+}
+
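+/*
+ * header_locations_read_sysfs
+ *
+ * Custom sysfs read handler that tells the user what resume= value to use
+ * for each currently enabled swap partition or swap file.
+ */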
+static int header_locations_read_sysfs(const char *page, int count)
+{
+        int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
+        struct inode *swapf = NULL;
+        int zone;
+        char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
+        char *path, *output = (char *) page;
+        int path_len;
+
+        if (!page)
+                return 0;
+
+        for (i = 0; i < MAX_SWAPFILES; i++) {
+                struct swap_info_struct *si =  get_swap_info_struct(i);
+
+                if (!si || !(si->flags & SWP_WRITEOK))
+                        continue;
+
+                if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
+                        haveswap = 1;
+                        if (!printedpartitionsmessage) {
+                                len += sprintf(output + len,
+                                        "For swap partitions, simply use the "
+                                        "format: resume=swap:/dev/hda1.\n");
+                                printedpartitionsmessage = 1;
+                        }
+                } else {
+                        path_len = 0;
+
+                        path = d_path(&si->swap_file->f_path, path_page,
+                                        PAGE_SIZE);
+                        path_len = snprintf(path_page, PAGE_SIZE, "%s", path);
+
+                        haveswap = 1;
+                        swapf = si->swap_file->f_mapping->host;
+                        zone = bmap(swapf, 0);
+                        if (!zone) {
+                                len += sprintf(output + len,
+                                        "Swapfile %s has been corrupted. Re-run"
+                                        " mkswap on it and try again.\n",
+                                        path_page);
+                        } else {
+                                char name_buffer[BDEVNAME_SIZE];
+                                len += sprintf(output + len,
+                                        "For swapfile `%s`,"
+                                        " use resume=swap:/dev/%s:0x%x.\n",
+                                        path_page,
+                                        bdevname(si->bdev, name_buffer),
+                                        zone << (swapf->i_blkbits - 9));
+                        }
+                }
+        }
+
+        if (!haveswap)
+                len = sprintf(output, "You need to turn on swap partitions "
+                                "before examining this file.\n");
+
+        toi_free_page(10, (unsigned long) path_page);
+        return len;
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
+        SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
+                        header_locations_read_sysfs, NULL, 0, NULL),
+        SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
+                        attempt_to_parse_resume_device2),
+};
+
+static struct toi_bio_allocator_ops toi_bio_swapops = {
+        .register_storage                        = toi_swap_register_storage,
+        .storage_available                        = toi_swap_storage_available,
+        .allocate_storage                        = toi_swap_allocate_storage,
+        .bmap                                        = get_main_pool_phys_params,
+        .free_storage                                = toi_swap_free_storage,
+        .free_unused_storage                    = toi_swap_free_unused_storage,
+};
+
+static struct toi_module_ops toi_swapops = {
+        .type                                        = BIO_ALLOCATOR_MODULE,
+        .name                                        = "swap storage",
+        .directory                                = "swap",
+        .module                                        = THIS_MODULE,
+        .memory_needed                                = toi_swap_memory_needed,
+        .print_debug_info                        = toi_swap_print_debug_stats,
+        .initialise                                = toi_swap_initialise,
+        .cleanup                                = toi_swap_cleanup,
+        .bio_allocator_ops                        = &toi_bio_swapops,
+
+        .sysfs_data                = sysfs_params,
+        .num_sysfs_entries        = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+static __init int toi_swap_load(void)
+{
+        return toi_register_module(&toi_swapops);
+}
+
+late_initcall(toi_swap_load);
diff --git b/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
new file mode 100644
index 0000000..77f36db
--- /dev/null
+++ b/kernel/power/tuxonice_sysfs.c
@@ -0,0 +1,333 @@
+/*
+ * kernel/power/tuxonice_sysfs.c
+ *
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains support for sysfs entries for tuning TuxOnIce.
+ *
+ * We have a generic handler that deals with the most common cases, and
+ * hooks for special handlers to use.
+ */
+
+#include <linux/suspend.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_alloc.h"
+
+static int toi_sysfs_initialised;
+
+static void toi_initialise_sysfs(void);
+
+static struct toi_sysfs_data sysfs_params[];
+
+#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
+
+static void toi_main_wrapper(void)
+{
+        toi_try_hibernate();
+}
+
+static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *page)
+{
+        struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
+        int len = 0;
+        int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ;
+
+        if (full_prep && toi_start_anything(0))
+                return -EBUSY;
+
+        if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
+                toi_prepare_usm();
+
+        switch (sysfs_data->type) {
+        case TOI_SYSFS_DATA_CUSTOM:
+                len = (sysfs_data->data.special.read_sysfs) ?
+                        (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
+                        : 0;
+                break;
+        case TOI_SYSFS_DATA_BIT:
+                len = sprintf(page, "%d\n",
+                        -test_bit(sysfs_data->data.bit.bit,
+                                sysfs_data->data.bit.bit_vector));
+                break;
+        case TOI_SYSFS_DATA_INTEGER:
+                len = sprintf(page, "%d\n",
+                        *(sysfs_data->data.integer.variable));
+                break;
+        case TOI_SYSFS_DATA_LONG:
+                len = sprintf(page, "%ld\n",
+                        *(sysfs_data->data.a_long.variable));
+                break;
+        case TOI_SYSFS_DATA_UL:
+                len = sprintf(page, "%lu\n",
+                        *(sysfs_data->data.ul.variable));
+                break;
+        case TOI_SYSFS_DATA_STRING:
+                len = sprintf(page, "%s\n",
+                        sysfs_data->data.string.variable);
+                break;
+        }
+
+        if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
+                toi_cleanup_usm();
+
+        if (full_prep)
+                toi_finish_anything(0);
+
+        return len;
+}
+
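+/*
+ * Clamp *_variable to the [minimum, maximum] range recorded for the sysfs
+ * entry (_type selects the integer, a_long or ul member of the data union).
+ */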
+#define BOUND(_variable, _type) do { \
+        if (*_variable < sysfs_data->data._type.minimum) \
+                *_variable = sysfs_data->data._type.minimum; \
+        else if (*_variable > sysfs_data->data._type.maximum) \
+                *_variable = sysfs_data->data._type.maximum; \
+} while (0)
+
+static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
+                const char *my_buf, size_t count)
+{
+        int assigned_temp_buffer = 0, result = count;
+        struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
+
+        if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
+                return -EBUSY;
+
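+        /* NUL-terminate the incoming data in place (casting away const) so
+         * the parsing below sees a proper C string. */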
+        ((char *) my_buf)[count] = 0;
+
+        if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
+                toi_prepare_usm();
+
+        switch (sysfs_data->type) {
+        case TOI_SYSFS_DATA_CUSTOM:
+                if (sysfs_data->data.special.write_sysfs)
+                        result = (sysfs_data->data.special.write_sysfs)(my_buf,
+                                        count);
+                break;
+        case TOI_SYSFS_DATA_BIT:
+                {
+                unsigned long value;
+                result = kstrtoul(my_buf, 0, &value);
+                if (result)
+                        break;
+                if (value)
+                        set_bit(sysfs_data->data.bit.bit,
+                                (sysfs_data->data.bit.bit_vector));
+                else
+                        clear_bit(sysfs_data->data.bit.bit,
+                                (sysfs_data->data.bit.bit_vector));
+                }
+                break;
+        case TOI_SYSFS_DATA_INTEGER:
+                {
+                        long temp;
+                        result = kstrtol(my_buf, 0, &temp);
+                        if (result)
+                                break;
+                        *(sysfs_data->data.integer.variable) = (int) temp;
+                        BOUND(sysfs_data->data.integer.variable, integer);
+                        break;
+                }
+        case TOI_SYSFS_DATA_LONG:
+                {
+                        long *variable =
+                                sysfs_data->data.a_long.variable;
+                        result = kstrtol(my_buf, 0, variable);
+                        if (result)
+                                break;
+                        BOUND(variable, a_long);
+                        break;
+                }
+        case TOI_SYSFS_DATA_UL:
+                {
+                        unsigned long *variable =
+                                sysfs_data->data.ul.variable;
+                        result = kstrtoul(my_buf, 0, variable);
+                        if (result)
+                                break;
+                        BOUND(variable, ul);
+                        break;
+                }
+        case TOI_SYSFS_DATA_STRING:
+                {
+                        int copy_len = count;
+                        char *variable =
+                                sysfs_data->data.string.variable;
+
+                        if (sysfs_data->data.string.max_length &&
+                            (copy_len > sysfs_data->data.string.max_length))
+                                copy_len = sysfs_data->data.string.max_length;
+
+                        if (!variable) {
+                                variable = (char *) toi_get_zeroed_page(31,
+                                                TOI_ATOMIC_GFP);
+                                sysfs_data->data.string.variable = variable;
+                                assigned_temp_buffer = 1;
+                        }
+                        strncpy(variable, my_buf, copy_len);
+                        if (copy_len && my_buf[copy_len - 1] == '\n')
+                                variable[copy_len - 1] = 0;
+                        variable[copy_len] = 0;
+                }
+                break;
+        }
+
+        if (!result)
+                result = count;
+
+        /* Side effect routine? */
+        if (result == count && sysfs_data->write_side_effect)
+                sysfs_data->write_side_effect();
+
+        /* Free temporary buffers */
+        if (assigned_temp_buffer) {
+                toi_free_page(31,
+                        (unsigned long) sysfs_data->data.string.variable);
+                sysfs_data->data.string.variable = NULL;
+        }
+
+        if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
+                toi_cleanup_usm();
+
+        toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
+
+        return result;
+}
+
+static struct sysfs_ops toi_sysfs_ops = {
+        .show        = &toi_attr_show,
+        .store        = &toi_attr_store,
+};
+
+static struct kobj_type toi_ktype = {
+        .sysfs_ops        = &toi_sysfs_ops,
+};
+
+struct kobject *tuxonice_kobj;
+
+/* Non-module sysfs entries.
+ *
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+        SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL,
+                SYSFS_HIBERNATING, toi_main_wrapper),
+        SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL,
+                SYSFS_RESUMING, toi_try_resume)
+};
+
+void remove_toi_sysdir(struct kobject *kobj)
+{
+        if (!kobj)
+                return;
+
+        kobject_put(kobj);
+}
+
+struct kobject *make_toi_sysdir(char *name)
+{
+        struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
+
+        if (!kobj) {
+                printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
+                                "dir!\n");
+                return NULL;
+        }
+
+        kobj->ktype = &toi_ktype;
+
+        return kobj;
+}
+
+/* toi_register_sysfs_file
+ *
+ * Helper for registering a new /sys/power/tuxonice entry.
+ */
+
+int toi_register_sysfs_file(
+                struct kobject *kobj,
+                struct toi_sysfs_data *toi_sysfs_data)
+{
+        int result;
+
+        if (!toi_sysfs_initialised)
+                toi_initialise_sysfs();
+
+        result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
+        if (result)
+                printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
+                        "returned %d.\n",
+                        toi_sysfs_data->attr.name, result);
+        kobj->ktype = &toi_ktype;
+
+        return result;
+}
+
+/* toi_unregister_sysfs_file
+ *
+ * Helper for removing unwanted /sys/power/tuxonice entries.
+ *
+ */
+void toi_unregister_sysfs_file(struct kobject *kobj,
+                struct toi_sysfs_data *toi_sysfs_data)
+{
+        sysfs_remove_file(kobj, &toi_sysfs_data->attr);
+}
+
+void toi_cleanup_sysfs(void)
+{
+        int i,
+            numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
+
+        if (!toi_sysfs_initialised)
+                return;
+
+        for (i = 0; i < numfiles; i++)
+                toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+
+        kobject_put(tuxonice_kobj);
+        toi_sysfs_initialised = 0;
+}
+
+/* toi_initialise_sysfs
+ *
+ * Initialise the /sys/power/tuxonice directory.
+ */
+
+static void toi_initialise_sysfs(void)
+{
+        int i;
+        int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
+
+        if (toi_sysfs_initialised)
+                return;
+
+        /* Make our TuxOnIce directory a child of /sys/power */
+        tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
+        if (!tuxonice_kobj)
+                return;
+
+        toi_sysfs_initialised = 1;
+
+        for (i = 0; i < numfiles; i++)
+                toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+}
+
+int toi_sysfs_init(void)
+{
+        toi_initialise_sysfs();
+        return 0;
+}
+
+void toi_sysfs_exit(void)
+{
+        toi_cleanup_sysfs();
+}
diff --git b/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
new file mode 100644
index 0000000..1de954c
--- /dev/null
+++ b/kernel/power/tuxonice_sysfs.h
@@ -0,0 +1,137 @@
+/*
+ * kernel/power/tuxonice_sysfs.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/sysfs.h>
+
+struct toi_sysfs_data {
+        struct attribute attr;
+        int type;
+        int flags;
+        union {
+                struct {
+                        unsigned long *bit_vector;
+                        int bit;
+                } bit;
+                struct {
+                        int *variable;
+                        int minimum;
+                        int maximum;
+                } integer;
+                struct {
+                        long *variable;
+                        long minimum;
+                        long maximum;
+                } a_long;
+                struct {
+                        unsigned long *variable;
+                        unsigned long minimum;
+                        unsigned long maximum;
+                } ul;
+                struct {
+                        char *variable;
+                        int max_length;
+                } string;
+                struct {
+                        int (*read_sysfs) (const char *buffer, int count);
+                        int (*write_sysfs) (const char *buffer, int count);
+                        void *data;
+                } special;
+        } data;
+
+        /* Side effect routine. Used, e.g., for re-parsing the
+         * resume= entry when it changes. */
+        void (*write_side_effect) (void);
+        struct list_head sysfs_data_list;
+};
+
+enum {
+        TOI_SYSFS_DATA_NONE = 1,
+        TOI_SYSFS_DATA_CUSTOM,
+        TOI_SYSFS_DATA_BIT,
+        TOI_SYSFS_DATA_INTEGER,
+        TOI_SYSFS_DATA_UL,
+        TOI_SYSFS_DATA_LONG,
+        TOI_SYSFS_DATA_STRING
+};
+
+#define SYSFS_WRITEONLY 0200
+#define SYSFS_READONLY 0444
+#define SYSFS_RW 0644
+
+#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_BIT, \
+        .flags = _flags, \
+        .data = { .bit = { .bit_vector = _ul, .bit = _bit } } }
+
+#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_INTEGER, \
+        .flags = _flags, \
+        .data = { .integer = { .variable = _int, .minimum = _min, \
+                        .maximum = _max } }, \
+        .write_side_effect = _wse }
+
+#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_UL, \
+        .flags = _flags, \
+        .data = { .ul = { .variable = _ul, .minimum = _min, \
+                        .maximum = _max } } }
+
+#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_LONG, \
+        .flags = _flags, \
+        .data = { .a_long = { .variable = _long, .minimum = _min, \
+                        .maximum = _max } } }
+
+#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_STRING, \
+        .flags = _flags, \
+        .data = { .string = { .variable = _string, .max_length = _max_len } }, \
+        .write_side_effect = _wse }
+
+#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \
+        .attr = {.name  = _name , .mode   = _mode }, \
+        .type = TOI_SYSFS_DATA_CUSTOM, \
+        .flags = _flags, \
+        .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \
+        .write_side_effect = _wse }
+
+#define SYSFS_NONE(_name, _wse) { \
+        .attr = {.name  = _name , .mode   = SYSFS_WRITEONLY }, \
+        .type = TOI_SYSFS_DATA_NONE, \
+        .write_side_effect = _wse, \
+}
+
+/* Flags */
+#define SYSFS_NEEDS_SM_FOR_READ 1
+#define SYSFS_NEEDS_SM_FOR_WRITE 2
+#define SYSFS_HIBERNATE 4
+#define SYSFS_RESUME 8
+#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
+#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
+#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
+#define SYSFS_NEEDS_SM_FOR_BOTH \
+ (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
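+
+/*
+ * Example (taken from the swap allocator elsewhere in this patch): a module
+ * builds an array of these entries and points its toi_module_ops at it:
+ *
+ *        static struct toi_sysfs_data sysfs_params[] = {
+ *                SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
+ *                SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
+ *                                attempt_to_parse_resume_device2),
+ *        };
+ */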
+
+int toi_register_sysfs_file(struct kobject *kobj,
+                struct toi_sysfs_data *toi_sysfs_data);
+void toi_unregister_sysfs_file(struct kobject *kobj,
+                struct toi_sysfs_data *toi_sysfs_data);
+
+extern struct kobject *tuxonice_kobj;
+
+struct kobject *make_toi_sysdir(char *name);
+void remove_toi_sysdir(struct kobject *obj);
+extern void toi_cleanup_sysfs(void);
+
+extern int toi_sysfs_init(void);
+extern void toi_sysfs_exit(void);
diff --git b/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
new file mode 100644
index 0000000..76152f3
--- /dev/null
+++ b/kernel/power/tuxonice_ui.c
@@ -0,0 +1,247 @@
+/*
+ * kernel/power/tuxonice_ui.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for TuxOnIce's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/reboot.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_builtin.h"
+
+static char local_printf_buf[1024];        /* Same as printk - should be safe */
+struct ui_ops *toi_current_ui;
+
+/**
+ * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
+ *
+ * @timeout: Maximum time to wait.
+ *
+ * Wait for a keypress, either from userui or /dev/console if userui isn't
+ * available. The non-userui path is particularly for boot time, prior
+ * to userui being started, when we have an important warning to give to
+ * the user.
+ */
+static char toi_wait_for_keypress(int timeout)
+{
+        if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
+                return ' ';
+
+        return toi_wait_for_keypress_dev_console(timeout);
+}
+
+/* toi_early_boot_message()
+ * Description:        Handle errors early in the process of booting.
+ *                 The user may press C to continue booting, perhaps
+ *                 invalidating the image, or space to reboot.
+ *                 This works from either the serial console or normally
+ *                 attached keyboard.
+ *
+ *                 Note that we come in here from init, while the kernel is
+ *                 locked. If we want to get events from the serial console,
+ *                 we need to temporarily unlock the kernel.
+ *
+ *                 toi_early_boot_message may also be called post-boot.
+ *                 In this case, it simply printks the message and returns.
+ *
+ * Arguments:        int        Whether we are able to erase the image.
+ *                 int        default_answer. What to do when we timeout. This
+ *                         will normally be continue, but the user might
+ *                         provide command line options (__setup) to override
+ *                         particular cases.
+ *                 char *. Pointer to a string explaining why we're moaning.
+ */
+
+#define say(message, a...) printk(KERN_EMERG message, ##a)
+
+void toi_early_boot_message(int message_detail, int default_answer,
+        char *warning_reason, ...)
+{
+        unsigned long orig_state = get_toi_state(), continue_req = 0;
+#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
+        unsigned long orig_loglevel = console_loglevel;
+        int can_ask = 1;
+#else
+        int can_ask = 0;
+#endif
+
+        va_list args;
+        int printed_len;
+
+        if (!toi_wait) {
+                set_toi_state(TOI_CONTINUE_REQ);
+                can_ask = 0;
+        }
+
+        if (warning_reason) {
+                va_start(args, warning_reason);
+                printed_len = vsnprintf(local_printf_buf,
+                                sizeof(local_printf_buf),
+                                warning_reason,
+                                args);
+                va_end(args);
+        }
+
+        if (!test_toi_state(TOI_BOOT_TIME)) {
+                printk("TuxOnIce: %s\n", local_printf_buf);
+                return;
+        }
+
+        if (!can_ask) {
+                continue_req = !!default_answer;
+                goto post_ask;
+        }
+
+#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
+        console_loglevel = 7;
+
+        say("=== TuxOnIce ===\n\n");
+        if (warning_reason) {
+                say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
+                switch (message_detail) {
+                case 0:
+                        say("If you continue booting, note that any image WILL "
+                                "NOT BE REMOVED.\nTuxOnIce is unable to do so "
+                                "because the appropriate modules aren't\n"
+                                "loaded. You should manually remove the image "
+                                "to avoid any\npossibility of corrupting your "
+                                "filesystem(s) later.\n");
+                        break;
+                case 1:
+                        say("If you want to use the current TuxOnIce image, "
+                                "reboot and try\nagain with the same kernel "
+                                "that you hibernated from. If you want\n"
+                                "to forget that image, continue and the image "
+                                "will be erased.\n");
+                        break;
+                }
+                say("Press SPACE to reboot or C to continue booting with "
+                        "this kernel\n\n");
+                if (toi_wait > 0)
+                        say("Default action if you don't select one in %d "
+                                "seconds is: %s.\n",
+                                toi_wait,
+                                default_answer == TOI_CONTINUE_REQ ?
+                                "continue booting" : "reboot");
+        } else {
+                say("BIG FAT WARNING!!\n\n"
+                        "You have tried to resume from this image before.\n"
+                        "If it failed once, it may well fail again.\n"
+                        "Would you like to remove the image and boot "
+                        "normally?\nThis will be equivalent to entering "
+                        "noresume on the\nkernel command line.\n\n"
+                        "Press SPACE to remove the image or C to continue "
+                        "resuming.\n\n");
+                if (toi_wait > 0)
+                        say("Default action if you don't select one in %d "
+                                "seconds is: %s.\n", toi_wait,
+                                !!default_answer ?
+                                "continue resuming" : "remove the image");
+        }
+        console_loglevel = orig_loglevel;
+
+        set_toi_state(TOI_SANITY_CHECK_PROMPT);
+        clear_toi_state(TOI_CONTINUE_REQ);
+
+        if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
+                continue_req = !!default_answer;
+        else
+                continue_req = test_toi_state(TOI_CONTINUE_REQ);
+
+#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
+
+post_ask:
+        if ((warning_reason) && (!continue_req))
+                kernel_restart(NULL);
+
+        restore_toi_state(orig_state);
+        if (continue_req)
+                set_toi_state(TOI_CONTINUE_REQ);
+}
+
+#undef say
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+        SYSFS_INT("default_console_level", SYSFS_RW,
+                        &toi_bkd.toi_default_console_level, 0, 7, 0, NULL),
+        SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0,
+                        1 << 30, 0),
+        SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL,
+                        0)
+#endif
+};
+
+static struct toi_module_ops userui_ops = {
+        .type                                = MISC_HIDDEN_MODULE,
+        .name                                = "printk ui",
+        .directory                        = "user_interface",
+        .module                                = THIS_MODULE,
+        .sysfs_data                        = sysfs_params,
+        .num_sysfs_entries                = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+int toi_register_ui_ops(struct ui_ops *this_ui)
+{
+        if (toi_current_ui) {
+                printk(KERN_INFO "Only one TuxOnIce user interface module can "
+                                "be loaded at a time.\n");
+                return -EBUSY;
+        }
+
+        toi_current_ui = this_ui;
+
+        return 0;
+}
+
+void toi_remove_ui_ops(struct ui_ops *this_ui)
+{
+        if (toi_current_ui != this_ui)
+                return;
+
+        toi_current_ui = NULL;
+}
+
+/* toi_ui_init
+ * Description: Boot time initialisation for user interface.
+ */
+
+int toi_ui_init(void)
+{
+        return toi_register_module(&userui_ops);
+}
+
+void toi_ui_exit(void)
+{
+        toi_unregister_module(&userui_ops);
+}
diff --git b/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
new file mode 100644
index 0000000..4934e3a
--- /dev/null
+++ b/kernel/power/tuxonice_ui.h
@@ -0,0 +1,97 @@
+/*
+ * kernel/power/tuxonice_ui.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ */
+
+enum {
+        DONT_CLEAR_BAR,
+        CLEAR_BAR
+};
+
+enum {
+        /* Userspace -> Kernel */
+        USERUI_MSG_ABORT = 0x11,
+        USERUI_MSG_SET_STATE = 0x12,
+        USERUI_MSG_GET_STATE = 0x13,
+        USERUI_MSG_GET_DEBUG_STATE = 0x14,
+        USERUI_MSG_SET_DEBUG_STATE = 0x15,
+        USERUI_MSG_SPACE = 0x18,
+        USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
+        USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
+        USERUI_MSG_GET_LOGLEVEL = 0x1C,
+        USERUI_MSG_SET_LOGLEVEL = 0x1D,
+        USERUI_MSG_PRINTK = 0x1E,
+
+        /* Kernel -> Userspace */
+        USERUI_MSG_MESSAGE = 0x21,
+        USERUI_MSG_PROGRESS = 0x22,
+        USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
+
+        USERUI_MSG_MAX,
+};
+
+struct userui_msg_params {
+        u32 a, b, c, d;
+        char text[255];
+};
+
+struct ui_ops {
+        char (*wait_for_key) (int timeout);
+        u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...);
+        void (*prepare_status) (int clearbar, const char *fmt, ...);
+        void (*cond_pause) (int pause, char *message);
+        void (*abort)(int result_code, const char *fmt, ...);
+        void (*prepare)(void);
+        void (*cleanup)(void);
+        void (*message)(u32 section, u32 level, u32 normally_logged,
+                        const char *fmt, ...);
+};
+
+extern struct ui_ops *toi_current_ui;
+
+#define toi_update_status(val, max, fmt, args...) \
+ (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
+        max)
+
+#define toi_prepare_console() \
+        do { if (toi_current_ui) \
+                (toi_current_ui->prepare)(); \
+        } while (0)
+
+#define toi_cleanup_console() \
+        do { if (toi_current_ui) \
+                (toi_current_ui->cleanup)(); \
+        } while (0)
+
+#define abort_hibernate(result, fmt, args...) \
+        do { if (toi_current_ui) \
+                (toi_current_ui->abort)(result, fmt, ##args); \
+             else { \
+                set_abort_result(result); \
+             } \
+        } while (0)
+
+#define toi_cond_pause(pause, message) \
+        do { if (toi_current_ui) \
+                (toi_current_ui->cond_pause)(pause, message); \
+        } while (0)
+
+#define toi_prepare_status(clear, fmt, args...) \
+        do { if (toi_current_ui) \
+                (toi_current_ui->prepare_status)(clear, fmt, ##args); \
+             else \
+                printk(KERN_INFO fmt "%s", ##args, "\n"); \
+        } while (0)
+
+#define toi_message(sn, lev, log, fmt, a...) \
+do { \
+        if (toi_current_ui && (!sn || test_debug_state(sn))) \
+                toi_current_ui->message(sn, lev, log, fmt, ##a); \
+} while (0)
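+
+/*
+ * Example (from the swap allocator elsewhere in this patch):
+ *
+ *        toi_message(TOI_IO, TOI_VERBOSE, 0, "  Allocated %lu pages.", gotten);
+ *
+ * The call is a no-op unless a UI is registered and, for a non-zero section,
+ * that debug section is enabled.
+ */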
+
+__exit void toi_ui_cleanup(void);
+extern int toi_ui_init(void);
+extern void toi_ui_exit(void);
+extern int toi_register_ui_ops(struct ui_ops *this_ui);
+extern void toi_remove_ui_ops(struct ui_ops *this_ui);
diff --git b/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
new file mode 100644
index 0000000..6aa5ac3
--- /dev/null
+++ b/kernel/power/tuxonice_userui.c
@@ -0,0 +1,658 @@
+/*
+ * kernel/power/tuxonice_userui.c
+ *
+ * Copyright (C) 2005-2007 Bernard Blackham
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for TuxOnIce's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>
+#include <linux/reboot.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/vt.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_power_off.h"
+
+static char local_printf_buf[1024];        /* Same as printk - should be safe */
+
+static struct user_helper_data ui_helper_data;
+static struct toi_module_ops userui_ops;
+static int orig_kmsg;
+
+static char lastheader[512];
+static int lastheader_message_len;
+static int ui_helper_changed; /* Used at resume-time so don't overwrite value
+                                set from initrd/ramfs. */
+
+/* Number of distinct progress amounts that userspace can display */
+static int progress_granularity = 30;
+
+static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
+static int userui_wait_should_wake;
+
+#define toi_stop_waiting_for_userui_key() \
+do { \
+        userui_wait_should_wake = true; \
+        wake_up_interruptible(&userui_wait_for_key); \
+} while (0)
+
+/**
+ * ui_nl_set_state - Update toi_action based on a message from userui.
+ *
+ * @n: The bit (1 << bit) to set.
+ */
+static void ui_nl_set_state(int n)
+{
+        /* Only let them change certain settings */
+        static const u32 toi_action_mask =
+                (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
+                (1 << TOI_LOGALL) |
+                (1 << TOI_SINGLESTEP) |
+                (1 << TOI_PAUSE_NEAR_PAGESET_END);
+        static unsigned long new_action;
+
+        new_action = (toi_bkd.toi_action & (~toi_action_mask)) |
+                (n & toi_action_mask);
+
+        printk(KERN_DEBUG "n is %x. Action flags being changed from %lx "
+                        "to %lx.", n, toi_bkd.toi_action, new_action);
+        toi_bkd.toi_action = new_action;
+
+        if (!test_action_state(TOI_PAUSE) &&
+                        !test_action_state(TOI_SINGLESTEP))
+                toi_stop_waiting_for_userui_key();
+}
+
+/**
+ * userui_post_atomic_restore - Tell userui that atomic restore just happened.
+ *
+ * Tell userui that atomic restore just occurred, so that it can do things like
+ * redrawing the screen, re-getting settings and so on.
+ */
+static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+        toi_send_netlink_message(&ui_helper_data,
+                        USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
+}
+
+/**
+ * userui_storage_needed - Report how much memory in image header is needed.
+ */
+static int userui_storage_needed(void)
+{
+        return sizeof(ui_helper_data.program) + 1 + sizeof(int);
+}
+
+/**
+ * userui_save_config_info - Fill buffer with config info for image header.
+ *
+ * @buf: Buffer into which to put the config info we want to save.
+ */
+static int userui_save_config_info(char *buf)
+{
+        *((int *) buf) = progress_granularity;
+        memcpy(buf + sizeof(int), ui_helper_data.program,
+                        sizeof(ui_helper_data.program));
+        return sizeof(ui_helper_data.program) + sizeof(int) + 1;
+}
+
+/**
+ * userui_load_config_info - Restore config info from buffer.
+ *
+ * @buf: Buffer containing header info loaded.
+ * @size: Size of data loaded for this module.
+ */
+static void userui_load_config_info(char *buf, int size)
+{
+        progress_granularity = *((int *) buf);
+        size -= sizeof(int);
+
+        /* Don't load the saved path if one has already been set */
+        if (ui_helper_changed)
+                return;
+
+        if (size > sizeof(ui_helper_data.program))
+                size = sizeof(ui_helper_data.program);
+
+        memcpy(ui_helper_data.program, buf + sizeof(int), size);
+        ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
+}
+
+/**
+ * set_ui_program_set: Record that userui program was changed.
+ *
+ * Side effect routine for when the userui program is set. In an initrd or
+ * ramfs, the user may set a location for the userui program. If this happens,
+ * we don't want to reload the value that was saved in the image header. This
+ * routine allows us to flag that we shouldn't restore the program name from
+ * the image header.
+ */
+static void set_ui_program_set(void)
+{
+        ui_helper_changed = 1;
+}
+
+/**
+ * userui_memory_needed - Tell core how much memory to reserve for us.
+ */
+static int userui_memory_needed(void)
+{
+        /* ball park figure of 128 pages */
+        return 128 * PAGE_SIZE;
+}
+
+/**
+ * userui_update_status - Update the progress bar and (if on) in-bar message.
+ *
+ * @value: Current progress percentage numerator.
+ * @maximum: Current progress percentage denominator.
+ * @fmt: Message to be displayed in the middle of the progress bar.
+ *
+ * Note that a NULL message does not mean that any previous message is erased!
+ * For that, you need toi_prepare_status with clearbar on.
+ *
+ * Returns an unsigned long, being the next numerator (as determined by the
+ * maximum and progress granularity) where status needs to be updated.
+ * This is to reduce unnecessary calls to update_status.
+ */
+static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
+{
+        static u32 last_step = 9999;
+        struct userui_msg_params msg;
+        u32 this_step, next_update;
+        int bitshift;
+
+        if (ui_helper_data.pid == -1)
+                return 0;
+
+        if ((!maximum) || (!progress_granularity))
+                return maximum;
+
+        if (value < 0)
+                value = 0;
+
+        if (value > maximum)
+                value = maximum;
+
+        /* Try to avoid math problems - we can't do 64 bit math here
+         * (and shouldn't need it - anyone got screen resolution
+         * of 65536 pixels or more?) */
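+        /*
+         * Example: with maximum = 1 << 20 and the default granularity of 30,
+         * fls(maximum) = 21 so bitshift = 5; value and maximum are both
+         * scaled down by 32, keeping the multiplications below within 32 bits.
+         */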
+        bitshift = fls(maximum) - 16;
+        if (bitshift > 0) {
+                u32 temp_maximum = maximum >> bitshift;
+                u32 temp_value = value >> bitshift;
+                this_step = (u32)
+                        (temp_value * progress_granularity / temp_maximum);
+                next_update = (((this_step + 1) * temp_maximum /
+                                        progress_granularity) + 1) << bitshift;
+        } else {
+                this_step = (u32) (value * progress_granularity / maximum);
+                next_update = ((this_step + 1) * maximum /
+                                progress_granularity) + 1;
+        }
+
+        if (this_step == last_step)
+                return next_update;
+
+        memset(&msg, 0, sizeof(msg));
+
+        msg.a = this_step;
+        msg.b = progress_granularity;
+
+        if (fmt) {
+                va_list args;
+                va_start(args, fmt);
+                vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+                va_end(args);
+                msg.text[sizeof(msg.text)-1] = '\0';
+        }
+
+        toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
+                        &msg, sizeof(msg));
+        last_step = this_step;
+
+        return next_update;
+}
+
+/**
+ * userui_message - Display a message without necessarily logging it.
+ *
+ * @section: Type of message. Messages can be filtered by type.
+ * @level: Degree of importance of the message. Lower values = higher priority.
+ * @normally_logged: Whether logged even if log_everything is off.
+ * @fmt: Message (and parameters).
+ *
+ * This function is intended to do the same job as printk, but without normally
+ * logging what is printed. The point is to be able to get debugging info on
+ * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
+ *
+ * It may be called from an interrupt context - can't sleep!
+ */
+static void userui_message(u32 section, u32 level, u32 normally_logged,
+                const char *fmt, ...)
+{
+        struct userui_msg_params msg;
+
+        if ((level) && (level > console_loglevel))
+                return;
+
+        memset(&msg, 0, sizeof(msg));
+
+        msg.a = section;
+        msg.b = level;
+        msg.c = normally_logged;
+
+        if (fmt) {
+                va_list args;
+                va_start(args, fmt);
+                vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+                va_end(args);
+                msg.text[sizeof(msg.text)-1] = '\0';
+        }
+
+        if (test_action_state(TOI_LOGALL))
+                printk(KERN_INFO "%s\n", msg.text);
+
+        toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
+                        &msg, sizeof(msg));
+}
+
+/**
+ * wait_for_key_via_userui - Wait for userui to receive a keypress.
+ */
+static void wait_for_key_via_userui(void)
+{
+        DECLARE_WAITQUEUE(wait, current);
+
+        add_wait_queue(&userui_wait_for_key, &wait);
+        set_current_state(TASK_INTERRUPTIBLE);
+
+        wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake);
+        userui_wait_should_wake = false;
+
+        set_current_state(TASK_RUNNING);
+        remove_wait_queue(&userui_wait_for_key, &wait);
+}
+
+/**
+ * userui_prepare_status - Display high level messages.
+ *
+ * @clearbar: Whether to clear the progress bar.
+ * @fmt...: New message for the title.
+ *
+ * Prepare the 'nice display', drawing the header and version, along with the
+ * current action and perhaps also resetting the progress bar.
+ */
+static void userui_prepare_status(int clearbar, const char *fmt, ...)
+{
+        va_list args;
+
+        if (fmt) {
+                va_start(args, fmt);
+                lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
+                va_end(args);
+        }
+
+        if (clearbar)
+                toi_update_status(0, 1, NULL);
+
+        if (ui_helper_data.pid == -1)
+                printk(KERN_EMERG "%s\n", lastheader);
+        else
+                toi_message(0, TOI_STATUS, 1, lastheader, NULL);
+}
+
+/**
+ * toi_wait_for_keypress - Wait for keypress via userui.
+ *
+ * @timeout: Maximum time to wait.
+ *
+ * Wait for a keypress from userui.
+ *
+ * FIXME: Implement timeout?
+ */
+static char userui_wait_for_keypress(int timeout)
+{
+        char key = '\0';
+
+        if (ui_helper_data.pid != -1) {
+                wait_for_key_via_userui();
+                key = ' ';
+        }
+
+        return key;
+}
+
+/**
+ * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
+ *
+ * @result_code: Reason why we're aborting (1 << bit).
+ * @fmt: Message to display if telling the user what's going on.
+ *
+ * Abort a cycle. If this wasn't at the user's request (and we're displaying
+ * output), tell the user why and wait for them to acknowledge the message.
+ */
+static void userui_abort_hibernate(int result_code, const char *fmt, ...)
+{
+        va_list args;
+        int printed_len = 0;
+
+        set_result_state(result_code);
+
+        if (test_result_state(TOI_ABORTED))
+                return;
+
+        set_result_state(TOI_ABORTED);
+
+        if (test_result_state(TOI_ABORT_REQUESTED))
+                return;
+
+        va_start(args, fmt);
+        printed_len = vsnprintf(local_printf_buf,  sizeof(local_printf_buf),
+                        fmt, args);
+        va_end(args);
+        if (ui_helper_data.pid != -1)
+                printed_len = sprintf(local_printf_buf + printed_len,
+                                        " (Press SPACE to continue)");
+
+        toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
+
+        if (ui_helper_data.pid != -1)
+                userui_wait_for_keypress(0);
+}
+
+/**
+ * request_abort_hibernate - Abort hibernating or resuming at user request.
+ *
+ * Handle the user requesting the cancellation of a hibernation or resume by
+ * pressing escape.
+ */
+static void request_abort_hibernate(void)
+{
+        if (test_result_state(TOI_ABORT_REQUESTED) ||
+           !test_action_state(TOI_CAN_CANCEL))
+                return;
+
+        if (test_toi_state(TOI_NOW_RESUMING)) {
+                toi_prepare_status(CLEAR_BAR, "Escape pressed. "
+                                        "Powering down again.");
+                set_toi_state(TOI_STOP_RESUME);
+                while (!test_toi_state(TOI_IO_STOPPED))
+                        schedule();
+                if (toiActiveAllocator->mark_resume_attempted)
+                        toiActiveAllocator->mark_resume_attempted(0);
+                toi_power_down();
+        }
+
+        toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
+                                        " ABORTING HIBERNATION ---");
+        set_abort_result(TOI_ABORT_REQUESTED);
+        toi_stop_waiting_for_userui_key();
+}
+
+/**
+ * userui_user_rcv_msg - Receive a netlink message from userui.
+ *
+ * @skb: skb received.
+ * @nlh: Netlink header received.
+ */
+static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+        int type;
+        int *data;
+
+        type = nlh->nlmsg_type;
+
+        /* A control message: ignore them */
+        if (type < NETLINK_MSG_BASE)
+                return 0;
+
+        /* Unknown message: reply with EINVAL */
+        if (type >= USERUI_MSG_MAX)
+                return -EINVAL;
+
+        /* All operations require privileges, even GET */
+        if (!capable(CAP_NET_ADMIN))
+                return -EPERM;
+
+        /* Only allow one task to receive NOFREEZE privileges */
+        if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
+                printk(KERN_INFO "Got NOFREEZE_ME request when "
+                        "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
+                return -EBUSY;
+        }
+
+        data = (int *) NLMSG_DATA(nlh);
+
+        switch (type) {
+        case USERUI_MSG_ABORT:
+                request_abort_hibernate();
+                return 0;
+        case USERUI_MSG_GET_STATE:
+                toi_send_netlink_message(&ui_helper_data,
+                                USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
+                                sizeof(toi_bkd.toi_action));
+                return 0;
+        case USERUI_MSG_GET_DEBUG_STATE:
+                toi_send_netlink_message(&ui_helper_data,
+                                USERUI_MSG_GET_DEBUG_STATE,
+                                &toi_bkd.toi_debug_state,
+                                sizeof(toi_bkd.toi_debug_state));
+                return 0;
+        case USERUI_MSG_SET_STATE:
+                if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+                        return -EINVAL;
+                ui_nl_set_state(*data);
+                return 0;
+        case USERUI_MSG_SET_DEBUG_STATE:
+                if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+                        return -EINVAL;
+                toi_bkd.toi_debug_state = (*data);
+                return 0;
+        case USERUI_MSG_SPACE:
+                toi_stop_waiting_for_userui_key();
+                return 0;
+        case USERUI_MSG_GET_POWERDOWN_METHOD:
+                toi_send_netlink_message(&ui_helper_data,
+                                USERUI_MSG_GET_POWERDOWN_METHOD,
+                                &toi_poweroff_method,
+                                sizeof(toi_poweroff_method));
+                return 0;
+        case USERUI_MSG_SET_POWERDOWN_METHOD:
+                if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
+                        return -EINVAL;
+                toi_poweroff_method = (unsigned long)(*data);
+                return 0;
+        case USERUI_MSG_GET_LOGLEVEL:
+                toi_send_netlink_message(&ui_helper_data,
+                                USERUI_MSG_GET_LOGLEVEL,
+                                &toi_bkd.toi_default_console_level,
+                                sizeof(toi_bkd.toi_default_console_level));
+                return 0;
+        case USERUI_MSG_SET_LOGLEVEL:
+                if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+                        return -EINVAL;
+                toi_bkd.toi_default_console_level = (*data);
+                return 0;
+        case USERUI_MSG_PRINTK:
+                printk(KERN_INFO "%s", (char *) data);
+                return 0;
+        }
+
+        /* Unhandled here */
+        return 1;
+}
+
+/**
+ * userui_cond_pause - Possibly pause at user request.
+ *
+ * @pause: Whether to pause or just display the message.
+ * @message: Message to display at the start of pausing.
+ *
+ * Potentially pause and wait for the user to tell us to continue. We normally
+ * only pause when @pause is set. While paused, the user can do things like
+ * changing the loglevel, toggling the display of debugging sections and such
+ * like.
+ */
+static void userui_cond_pause(int pause, char *message)
+{
+        int displayed_message = 0, last_key = 0;
+
+        while (last_key != 32 &&
+                ui_helper_data.pid != -1 &&
+                ((test_action_state(TOI_PAUSE) && pause) ||
+                 (test_action_state(TOI_SINGLESTEP)))) {
+                if (!displayed_message) {
+                        toi_prepare_status(DONT_CLEAR_BAR,
+                           "%s Press SPACE to continue.%s",
+                           message ? message : "",
+                           (test_action_state(TOI_SINGLESTEP)) ?
+                           " Single step on." : "");
+                        displayed_message = 1;
+                }
+                last_key = userui_wait_for_keypress(0);
+        }
+        schedule();
+}
+
+/**
+ * userui_prepare_console - Prepare the console for use.
+ *
+ * Prepare a console for use, saving current kmsg settings and attempting to
+ * start userui. Console loglevel changes are handled by userui.
+ */
+static void userui_prepare_console(void)
+{
+        orig_kmsg = vt_kmsg_redirect(fg_console + 1);
+
+        ui_helper_data.pid = -1;
+
+        if (!userui_ops.enabled) {
+                printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
+                return;
+        }
+
+        if (*ui_helper_data.program)
+                toi_netlink_setup(&ui_helper_data);
+        else
+                printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
+}
+
+/**
+ * userui_cleanup_console - Cleanup after a cycle.
+ *
+ * Tell userui to cleanup, and restore kmsg_redirect to its original value.
+ */
+
+static void userui_cleanup_console(void)
+{
+        if (ui_helper_data.pid > -1)
+                toi_netlink_close(&ui_helper_data);
+
+        vt_kmsg_redirect(orig_kmsg);
+}
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+        SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_CAN_CANCEL, 0),
+        SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
+                        TOI_PAUSE, 0),
+        SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
+        SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
+                        2048, 0, NULL),
+        SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
+                        set_ui_program_set),
+        SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
+#endif
+};
+
+static struct toi_module_ops userui_ops = {
+        .type                                = MISC_MODULE,
+        .name                                = "userui",
+        .shared_directory                = "user_interface",
+        .module                                = THIS_MODULE,
+        .storage_needed                        = userui_storage_needed,
+        .save_config_info                = userui_save_config_info,
+        .load_config_info                = userui_load_config_info,
+        .memory_needed                        = userui_memory_needed,
+        .post_atomic_restore                = userui_post_atomic_restore,
+        .sysfs_data                        = sysfs_params,
+        .num_sysfs_entries                = sizeof(sysfs_params) /
+                sizeof(struct toi_sysfs_data),
+};
+
+static struct ui_ops my_ui_ops = {
+        .update_status                        = userui_update_status,
+        .message                        = userui_message,
+        .prepare_status                        = userui_prepare_status,
+        .abort                                = userui_abort_hibernate,
+        .cond_pause                        = userui_cond_pause,
+        .prepare                        = userui_prepare_console,
+        .cleanup                        = userui_cleanup_console,
+        .wait_for_key                        = userui_wait_for_keypress,
+};
+
+/**
+ * toi_user_ui_init - Boot time initialisation for user interface.
+ *
+ * Invoked from the core init routine.
+ */
+static __init int toi_user_ui_init(void)
+{
+        int result;
+
+        ui_helper_data.nl = NULL;
+        strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
+        ui_helper_data.pid = -1;
+        ui_helper_data.skb_size = sizeof(struct userui_msg_params);
+        ui_helper_data.pool_limit = 6;
+        ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
+        ui_helper_data.name = "userspace ui";
+        ui_helper_data.rcv_msg = userui_user_rcv_msg;
+        ui_helper_data.interface_version = 8;
+        ui_helper_data.must_init = 0;
+        ui_helper_data.not_ready = userui_cleanup_console;
+        init_completion(&ui_helper_data.wait_for_process);
+        result = toi_register_module(&userui_ops);
+        if (!result) {
+                result = toi_register_ui_ops(&my_ui_ops);
+                if (result)
+                        toi_unregister_module(&userui_ops);
+        }
+
+        return result;
+}
+
+late_initcall(toi_user_ui_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c963ba5..464c10a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -33,6 +33,7 @@
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/syscalls.h>
+#include <linux/suspend.h>
 #include <linux/kexec.h>
 #include <linux/kdb.h>
 #include <linux/ratelimit.h>
@@ -284,6 +285,20 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
 static char *log_buf = __log_buf;
 static u32 log_buf_len = __LOG_BUF_LEN;
 
+#ifdef CONFIG_TOI_INCREMENTAL
+void toi_set_logbuf_untracked(void)
+{
+    int i;
+    struct page *log_buf_start_page = virt_to_page(__log_buf);
+
+    printk("Not protecting kernel printk log buffer (%p-%p).\n",
+            __log_buf, __log_buf + __LOG_BUF_LEN);
+
+    for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
+        SetPageTOI_Untracked(log_buf_start_page + i);
+}
+#endif
+
 /* Return log buffer address */
 char *log_buf_addr_get(void)
 {
@@ -1455,9 +1470,9 @@ static void call_console_drivers(int level,
 		    !(con->flags & CON_ANYTIME))
 			continue;
 		if (con->flags & CON_EXTENDED)
-			con->write(con, ext_text, ext_len);
+			con->write(con, ext_text, ext_len, level);
 		else
-			con->write(con, text, len);
+			con->write(con, text, len, level);
 	}
 }
 
@@ -1978,7 +1993,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
 	n = vscnprintf(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
 
-	early_console->write(early_console, buf, n);
+	early_console->write(early_console, buf, n, 0);
 }
 #endif
 
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1..5136969 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
 
 	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
 #ifdef CONFIG_SCHEDSTATS
+		force_schedstat_enabled();
 		prof_on = SLEEP_PROFILING;
 		if (str[strlen(sleepstr)] == ',')
 			str += strlen(sleepstr) + 1;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e41dd41..9fd5b62 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 	int needmore;
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
-	rcu_nocb_gp_cleanup(rsp, rnp);
 	rnp->need_future_gp[c & 0x1] = 0;
 	needmore = rnp->need_future_gp[(c + 1) & 0x1];
 	trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 	    !READ_ONCE(rsp->gp_flags) ||
 	    !rsp->gp_kthread)
 		return;
-	wake_up(&rsp->gp_wq);
+	swake_up(&rsp->gp_wq);
 }
 
 /*
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	int nocb = 0;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct swait_queue_head *sq;
 
 	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq_rcu_node(rnp);
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
 		/* smp_mb() provided by prior unlock-lock pair. */
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
+		sq = rcu_nocb_gp_get(rnp);
 		raw_spin_unlock_irq(&rnp->lock);
+		rcu_nocb_gp_cleanup(sq);
 		cond_resched_rcu_qs();
 		WRITE_ONCE(rsp->gp_activity, jiffies);
 		rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
-			wait_event_interruptible(rsp->gp_wq,
+			swait_event_interruptible(rsp->gp_wq,
 						 READ_ONCE(rsp->gp_flags) &
 						 RCU_GP_FLAG_INIT);
 			rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
-			ret = wait_event_interruptible_timeout(rsp->gp_wq,
+			ret = swait_event_interruptible_timeout(rsp->gp_wq,
 					rcu_gp_fqs_check_wake(rsp, &gf), j);
 			rsp->gp_state = RCU_GP_DOING_FQS;
 			/* Locking provides needed memory barriers. */
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
-	rcu_gp_kthread_wake(rsp);
+	swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	}
 	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
-	rcu_gp_kthread_wake(rsp);
+	swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			if (wake) {
 				smp_mb(); /* EGP done before wake_up(). */
-				wake_up(&rsp->expedited_wq);
+				swake_up(&rsp->expedited_wq);
 			}
 			break;
 		}
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	jiffies_start = jiffies;
 
 	for (;;) {
-		ret = wait_event_interruptible_timeout(
+		ret = swait_event_timeout(
 				rsp->expedited_wq,
 				sync_rcu_preempt_exp_done(rnp_root),
 				jiffies_stall);
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 			return;
 		if (ret < 0) {
 			/* Hit a signal, disable CPU stall warnings. */
-			wait_event(rsp->expedited_wq,
+			swait_event(rsp->expedited_wq,
 				   sync_rcu_preempt_exp_done(rnp_root));
 			return;
 		}
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		}
 	}
 
-	init_waitqueue_head(&rsp->gp_wq);
-	init_waitqueue_head(&rsp->expedited_wq);
+	init_swait_queue_head(&rsp->gp_wq);
+	init_swait_queue_head(&rsp->expedited_wq);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 83360b4..bbd235d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#include <linux/swait.h>
 #include <linux/stop_machine.h>
 
 /*
@@ -243,7 +244,7 @@ struct rcu_node {
 				/* Refused to boost: not sure why, though. */
 				/*  This can happen due to race conditions. */
 #ifdef CONFIG_RCU_NOCB_CPU
-	wait_queue_head_t nocb_gp_wq[2];
+	struct swait_queue_head nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 	int need_future_gp[2];
@@ -399,7 +400,7 @@ struct rcu_data {
 	atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
 	struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
 	struct rcu_head **nocb_follower_tail;
-	wait_queue_head_t nocb_wq;	/* For nocb kthreads to sleep on. */
+	struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
 	struct task_struct *nocb_kthread;
 	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */
 
@@ -478,7 +479,7 @@ struct rcu_state {
 	unsigned long gpnum;			/* Current gp number. */
 	unsigned long completed;		/* # of last completed gp. */
 	struct task_struct *gp_kthread;		/* Task for grace periods. */
-	wait_queue_head_t gp_wq;		/* Where GP task waits. */
+	struct swait_queue_head gp_wq;		/* Where GP task waits. */
 	short gp_flags;				/* Commands for GP task. */
 	short gp_state;				/* GP kthread sleep state. */
 
@@ -506,7 +507,7 @@ struct rcu_state {
 	unsigned long expedited_sequence;	/* Take a ticket. */
 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
-	wait_queue_head_t expedited_wq;		/* Wait for check-ins. */
+	struct swait_queue_head expedited_wq;	/* Wait for check-ins. */
 	int ncpus_snap;				/* # CPUs seen last time. */
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 			    bool lazy, unsigned long flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9467a8b..080bd20 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  * grace period.
  */
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
-	wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+	swake_up_all(sq);
 }
 
 /*
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
 	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+	return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
-	init_waitqueue_head(&rnp->nocb_gp_wq[0]);
-	init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+	init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
 		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
-		wake_up(&rdp_leader->nocb_wq);
+		swake_up(&rdp_leader->nocb_wq);
 	}
 }
 
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	 */
 	trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
 	for (;;) {
-		wait_event_interruptible(
+		swait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
 			(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
 		if (likely(d))
@@ -2097,7 +2102,7 @@ wait_again:
 	/* Wait for callbacks to appear. */
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
-		wait_event_interruptible(my_rdp->nocb_wq,
+		swait_event_interruptible(my_rdp->nocb_wq,
 				!READ_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
@@ -2172,7 +2177,7 @@ wait_again:
 			 * List was empty, wake up the follower.
 			 * Memory barriers supplied by atomic_long_add().
 			 */
-			wake_up(&rdp->nocb_wq);
+			swake_up(&rdp->nocb_wq);
 		}
 	}
 
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 		if (!rcu_nocb_poll) {
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    "FollowerSleep");
-			wait_event_interruptible(rdp->nocb_wq,
+			swait_event_interruptible(rdp->nocb_wq,
 						 READ_ONCE(rdp->nocb_follower_head));
 		} else if (firsttime) {
 			/* Don't drown trace log with "Poll"! */
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
 	rdp->nocb_tail = &rdp->nocb_head;
-	init_waitqueue_head(&rdp->nocb_wq);
+	init_swait_queue_head(&rdp->nocb_wq);
 	rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 }
 
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 	return false;
 }
 
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
 }
 
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
 {
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+	return NULL;
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6768797..15e3988 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,11 +11,16 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+ifdef CONFIG_SCHED_BFS
+obj-y += bfs.o clock.o
+else
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-y += wait.o swait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git b/kernel/sched/bfs.c b/kernel/sched/bfs.c
new file mode 100644
index 0000000..698007a
--- /dev/null
+++ b/kernel/sched/bfs.c
@@ -0,0 +1,7609 @@
+/*
+ *  kernel/sched/bfs.c, was kernel/sched.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-04-15  Work begun on replacing all interactivity tuning with a
+ *              fair scheduling design by Con Kolivas.
+ *  2007-05-05  Load balancing (smp-nice) and other improvements
+ *              by Peter Williams
+ *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
+ *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *              Thomas Gleixner, Mike Kravetz
+ *  now		Brainfuck deadline scheduling policy by Con Kolivas deletes
+ *              a whole lot of those previous things.
+ */
+
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/sched/prio.h>
+#include <linux/tick.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#include "bfs_sched.h"
+
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
+#define rt_queue(rq)		rt_prio((rq)->rq_prio)
+#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
+					(policy) == SCHED_RR)
+#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+
+#define is_idle_policy(policy)	((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p)	unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p)	unlikely((p)->prio == IDLE_PRIO)
+#define idle_queue(rq)		(unlikely(is_idle_policy((rq)->rq_policy)))
+
+#define is_iso_policy(policy)	((policy) == SCHED_ISO)
+#define iso_task(p)		unlikely(is_iso_policy((p)->policy))
+#define iso_queue(rq)		unlikely(is_iso_policy((rq)->rq_policy))
+#define task_running_iso(p)	unlikely((p)->prio == ISO_PRIO)
+#define rq_running_iso(rq)	((rq)->rq_prio == ISO_PRIO)
+
+#define rq_idle(rq)		((rq)->rq_prio == PRIO_LIMIT)
+
+#define ISO_PERIOD		((5 * HZ * grq.noc) + 1)
+
+#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
+#define STOP_PRIO		(MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate powers of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		(1000000000 / HZ)
+#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
+#define HALF_JIFFY_US		(1000000 / HZ / 2)
+#define MS_TO_NS(TIME)		((TIME) << 20)
+#define MS_TO_US(TIME)		((TIME) << 10)
+#define NS_TO_MS(TIME)		((TIME) >> 20)
+#define NS_TO_US(TIME)		((TIME) >> 10)
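+
+/*
+ * Worked example of the shift approximations above (illustrative only):
+ * MS_TO_NS(6) gives 6 << 20 = 6,291,456 ns rather than an exact 6,000,000 ns,
+ * an error of roughly 5% that is harmless at scheduling granularity, and
+ * NS_TO_US() divides by 1024 rather than 1000.
+ */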
+
+#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+	printk(KERN_INFO "BFS CPU scheduler v0.469 by Con Kolivas.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int rr_interval __read_mostly = 3;
+#else
+int rr_interval __read_mostly = 6;
+#endif
+
+/*
+ * Tunable to choose whether to prioritise latency or throughput, a simple
+ * binary yes or no.
+ */
+
+int sched_interactive __read_mostly = 1;
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run as real time tasks over a rolling five second period.
+ * This is the total over all online cpus.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int sched_iso_cpu __read_mostly = 25;
+#else
+int sched_iso_cpu __read_mostly = 70;
+#endif
+
+/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+static int prio_ratios[NICE_WIDTH] __read_mostly;
+
+/*
+ * The quota handed out to tasks of all priority levels when refilling their
+ * time_slice.
+ */
+static inline int timeslice(void)
+{
+	return MS_TO_US(rr_interval);
+}
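+
+/*
+ * Worked example (illustrative only): with the default rr_interval of 6 ms,
+ * timeslice() returns MS_TO_US(6) = 6 << 10 = 6144 us, so each refill grants
+ * a little over 6 ms of CPU time.
+ */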
+
+/*
+ * The global runqueue data that all CPUs work off. Data is protected either
+ * by the global grq lock, or the discrete lock that precedes the data in this
+ * struct.
+ */
+struct global_rq {
+	raw_spinlock_t lock;
+	unsigned long nr_running;
+	unsigned long nr_uninterruptible;
+	unsigned long long nr_switches;
+	struct list_head queue[PRIO_LIMIT];
+	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+	unsigned long qnr; /* queued not running */
+#ifdef CONFIG_SMP
+	cpumask_t cpu_idle_map;
+	bool idle_cpus;
+#endif
+	int noc; /* num_online_cpus stored and updated when it changes */
+	u64 niffies; /* Nanosecond jiffies */
+	unsigned long last_jiffy; /* Last jiffy we updated niffies */
+
+	raw_spinlock_t iso_lock;
+	int iso_ticks;
+	bool iso_refractory;
+};
+
+#ifdef CONFIG_SMP
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	atomic_t rto_count;
+	struct rcu_head rcu;
+	cpumask_var_t span;
+	cpumask_var_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_var_t rto_mask;
+	struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+static struct global_rq grq;
+
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu)
+{
+	return &per_cpu(runqueues, (cpu));
+}
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+DEFINE_MUTEX(sched_domains_mutex);
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+#else
+struct rq *uprq;
+#endif /* CONFIG_SMP */
+
+static inline void update_rq_clock(struct rq *rq);
+
+/*
+ * Sanity check should sched_clock return bogus values. We make sure it does
+ * not appear to go backwards, and use jiffies to determine the maximum and
+ * minimum it could possibly have increased, and round down to the nearest
+ * jiffy when it falls outside this.
+ */
+static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
+{
+	unsigned long min_diff, max_diff;
+
+	if (jiff_diff > 1)
+		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
+	else
+		min_diff = 1;
+	/*  Round up to the nearest tick for maximum */
+	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
+
+	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
+		*niff_diff = min_diff;
+}
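+
+/*
+ * Worked example (illustrative only, assuming HZ=250 so one jiffy is 4 ms):
+ * if two jiffies have elapsed, any clock delta outside the window
+ * [JIFFIES_TO_NS(1), JIFFIES_TO_NS(3)] = [4 ms, 12 ms] is considered bogus
+ * and clamped down to the 4 ms minimum.
+ */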
+
+#ifdef CONFIG_SMP
+static inline int cpu_of(struct rq *rq)
+{
+	return rq->cpu;
+}
+
+/*
+ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
+ * clock is updated with the grq.lock held, it is an opportunity to update the
+ * niffies value. Any CPU can update it by adding how much its clock has
+ * increased since it last updated niffies, minus any added niffies by other
+ * CPUs.
+ */
+static inline void update_clocks(struct rq *rq)
+{
+	s64 ndiff;
+	long jdiff;
+
+	update_rq_clock(rq);
+	ndiff = rq->clock - rq->old_clock;
+	/* old_clock is only updated when we are updating niffies */
+	rq->old_clock = rq->clock;
+	ndiff -= grq.niffies - rq->last_niffy;
+	jdiff = jiffies - grq.last_jiffy;
+	niffy_diff(&ndiff, jdiff);
+	grq.last_jiffy += jdiff;
+	grq.niffies += ndiff;
+	rq->last_niffy = grq.niffies;
+}
+#else /* CONFIG_SMP */
+static inline int cpu_of(struct rq *rq)
+{
+	return 0;
+}
+
+static inline void update_clocks(struct rq *rq)
+{
+	s64 ndiff;
+	long jdiff;
+
+	update_rq_clock(rq);
+	ndiff = rq->clock - rq->old_clock;
+	rq->old_clock = rq->clock;
+	jdiff = jiffies - grq.last_jiffy;
+	niffy_diff(&ndiff, jdiff);
+	grq.last_jiffy += jdiff;
+	grq.niffies += ndiff;
+}
+#endif
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch()	do { } while (0)
+#endif
+
+/*
+ * All common locking functions performed on grq.lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating niffies.
+ * Looking up task_rq must be done under grq.lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+	if (unlikely(delta < 0))
+		return;
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
+}
+
+static inline bool task_running(struct task_struct *p)
+{
+	return p->on_cpu;
+}
+
+static inline void grq_lock(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock(&grq.lock);
+}
+
+static inline void grq_unlock(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock(&grq.lock);
+}
+
+static inline void grq_lock_irq(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irq(&grq.lock);
+}
+
+static inline void time_lock_grq(struct rq *rq)
+	__acquires(grq.lock)
+{
+	grq_lock();
+	update_clocks(rq);
+}
+
+static inline void grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irq(&grq.lock);
+}
+
+static inline void grq_lock_irqsave(unsigned long *flags)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irqsave(&grq.lock, *flags);
+}
+
+static inline void grq_unlock_irqrestore(unsigned long *flags)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irqrestore(&grq.lock, *flags);
+}
+
+static inline struct rq
+*task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	grq_lock_irqsave(flags);
+	return task_rq(p);
+}
+
+static inline struct rq
+*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	struct rq *rq = task_grq_lock(p, flags);
+	update_clocks(rq);
+	return rq;
+}
+
+static inline struct rq *task_grq_lock_irq(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock_irq();
+	return task_rq(p);
+}
+
+static inline void time_task_grq_lock_irq(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	struct rq *rq = task_grq_lock_irq(p);
+	update_clocks(rq);
+}
+
+static inline void task_grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	grq_unlock_irq();
+}
+
+static inline void task_grq_unlock(unsigned long *flags)
+	__releases(grq.lock)
+{
+	grq_unlock_irqrestore(flags);
+}
+
+/**
+ * grunqueue_is_locked
+ *
+ * Returns true if the global runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+bool grunqueue_is_locked(void)
+{
+	return raw_spin_is_locked(&grq.lock);
+}
+
+void grq_unlock_wait(void)
+	__releases(grq.lock)
+{
+	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+	raw_spin_unlock_wait(&grq.lock);
+}
+
+static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	local_irq_save(*flags);
+	time_lock_grq(rq);
+}
+
+static inline struct rq *__task_grq_lock(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock();
+	return task_rq(p);
+}
+
+static inline void __task_grq_unlock(void)
+	__releases(grq.lock)
+{
+	grq_unlock();
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	grq.lock.owner = current;
+#endif
+	/*
+	 * If we are tracking spinlock dependencies then we have to
+	 * fix up the runqueue lock - which gets 'carried over' from
+	 * prev into current:
+	 */
+	spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
+
+	grq_unlock_irq();
+}
+
+static inline bool deadline_before(u64 deadline, u64 time)
+{
+	return (deadline < time);
+}
+
+static inline bool deadline_after(u64 deadline, u64 time)
+{
+	return (deadline > time);
+}
+
+/*
+ * A task that is queued but not running will be on the grq run list.
+ * A task that is not running or queued will not be on the grq run list.
+ * A task that is currently running will have ->on_cpu set but will not be on
+ * the grq run list.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+	return (!list_empty(&p->run_list));
+}
+
+/*
+ * Removing from the global runqueue. Enter with grq locked.
+ */
+static void dequeue_task(struct task_struct *p)
+{
+	list_del_init(&p->run_list);
+	if (list_empty(grq.queue + p->prio))
+		__clear_bit(p->prio, grq.prio_bitmap);
+	sched_info_dequeued(task_rq(p), p);
+}
+
+/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+static bool idleprio_suitable(struct task_struct *p)
+{
+	return (!freezing(p) && !signal_pending(p) &&
+		!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
+}
+
+/*
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
+ * that the iso_refractory flag is not set.
+ */
+static bool isoprio_suitable(void)
+{
+	return !grq.iso_refractory;
+}
+
+/*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+static void enqueue_task(struct task_struct *p, struct rq *rq)
+{
+	if (!rt_task(p)) {
+		/* Check it hasn't gotten rt from PI */
+		if ((idleprio_task(p) && idleprio_suitable(p)) ||
+		   (iso_task(p) && isoprio_suitable()))
+			p->prio = p->normal_prio;
+		else
+			p->prio = NORMAL_PRIO;
+	}
+	__set_bit(p->prio, grq.prio_bitmap);
+	list_add_tail(&p->run_list, grq.queue + p->prio);
+	sched_info_queued(rq, p);
+}
+
+static inline void requeue_task(struct task_struct *p)
+{
+	sched_info_queued(task_rq(p), p);
+}
+
+/*
+ * Returns the relative length of a task's deadline compared to the shortest
+ * deadline, which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+	return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+	return (rr_interval * task_prio_ratio(p) / 128);
+}
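+
+/*
+ * Worked example (illustrative only, assuming prio_ratios[0] == 128 for
+ * nice -20 as implied above): a nice -20 task gets task_timeslice() equal to
+ * rr_interval itself, while tasks at higher nice levels, whose prio_ratios
+ * are larger, get proportionally larger values.
+ */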
+
+static void resched_task(struct task_struct *p);
+
+static inline void resched_curr(struct rq *rq)
+{
+	resched_task(rq->curr);
+}
+
+/*
+ * qnr is the "queued but not running" count which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+	grq.qnr++;
+}
+
+static inline void dec_qnr(void)
+{
+	grq.qnr--;
+}
+
+static inline int queued_notrunning(void)
+{
+	return grq.qnr;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no in the idle_cpus variable,
+ * indicating whether any CPUs are idle, than to do a full bitmask check
+ * when we are busy.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+	if (likely(cpu_online(cpu))) {
+		cpumask_set_cpu(cpu, &grq.cpu_idle_map);
+		grq.idle_cpus = true;
+	}
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+	cpumask_clear_cpu(cpu, &grq.cpu_idle_map);
+	if (cpumask_empty(&grq.cpu_idle_map))
+		grq.idle_cpus = false;
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+	if (!grq.idle_cpus)
+		return false;
+	return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
+}
+
+#define CPUIDLE_DIFF_THREAD	(1)
+#define CPUIDLE_DIFF_CORE	(2)
+#define CPUIDLE_CACHE_BUSY	(4)
+#define CPUIDLE_DIFF_CPU	(8)
+#define CPUIDLE_THREAD_BUSY	(16)
+#define CPUIDLE_THROTTLED	(32)
+#define CPUIDLE_DIFF_NODE	(64)
+
+static inline bool scaling_rq(struct rq *rq);
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same core, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
+		CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
+		CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD;
+	int cpu_tmp;
+
+	if (cpumask_test_cpu(best_cpu, tmpmask))
+		goto out;
+
+	for_each_cpu(cpu_tmp, tmpmask) {
+		int ranking, locality;
+		struct rq *tmp_rq;
+
+		ranking = 0;
+		tmp_rq = cpu_rq(cpu_tmp);
+
+		locality = rq->cpu_locality[cpu_tmp];
+#ifdef CONFIG_NUMA
+		if (locality > 3)
+			ranking |= CPUIDLE_DIFF_NODE;
+		else
+#endif
+		if (locality > 2)
+			ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+		else if (locality == 2)
+			ranking |= CPUIDLE_DIFF_CORE;
+		if (!(tmp_rq->cache_idle(cpu_tmp)))
+			ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+		if (locality == 1)
+			ranking |= CPUIDLE_DIFF_THREAD;
+		if (!(tmp_rq->siblings_idle(cpu_tmp)))
+			ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+		if (scaling_rq(tmp_rq))
+			ranking |= CPUIDLE_THROTTLED;
+
+		if (ranking < best_ranking) {
+			best_cpu = cpu_tmp;
+			best_ranking = ranking;
+		}
+	}
+out:
+	return best_cpu;
+}
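+
+/*
+ * Worked example (illustrative only, assuming idle SMT siblings in both
+ * cases): an idle CPU on another core sharing this CPU's cache, even when
+ * that shared cache is busy, ranks CPUIDLE_DIFF_CORE | CPUIDLE_CACHE_BUSY = 6,
+ * which still beats an idle CPU elsewhere on the same node ranking
+ * CPUIDLE_DIFF_CPU = 8, so the cache-sharing CPU is preferred.
+ */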
+
+static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
+	resched_curr(cpu_rq(best_cpu));
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+
+	return (this_rq->cpu_locality[that_cpu] < 3);
+}
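+
+/*
+ * Illustrative reading of the locality values used above (inferred from the
+ * flags applied in best_mask_cpu()): 0 is presumably the cpu itself, 1 an SMT
+ * sibling, 2 another core sharing cache, 3 another CPU on the same node and
+ * above 3 another node, so "< 3" means the two cpus share at least a cache.
+ */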
+
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/*
+ * Find the best real time priority running on any SMT siblings of cpu and,
+ * if none are running, the static priority of the best deadline task running.
+ * The lookups of the other runqueues are done locklessly as the occasional
+ * wrong value would be harmless.
+ */
+static int best_smt_bias(int cpu)
+{
+	int other_cpu, best_bias = 0;
+
+	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq))
+			continue;
+		if (!rq->online)
+			continue;
+		if (!rq->rq_mm)
+			continue;
+		if (likely(rq->rq_smt_bias > best_bias))
+			best_bias = rq->rq_smt_bias;
+	}
+	return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+	if (rt_task(p))
+		return 1 << 30;
+	else if (task_running_iso(p))
+		return 1 << 29;
+	else if (task_running_idle(p))
+		return 0;
+	return MAX_PRIO - p->static_prio;
+}
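+
+/*
+ * Worked example (illustrative only, assuming the usual MAX_PRIO of 140 and a
+ * static_prio of 120 for nice 0): an RT task biases at 1 << 30, a task
+ * currently running ISO at 1 << 29, a nice 0 SCHED_NORMAL task at
+ * 140 - 120 = 20 and a nice 19 task at 1, so SMT nice strongly favours
+ * realtime and ISO tasks over normal ones.
+ */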
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, int cpu)
+{
+	int best_bias, task_bias;
+
+	/* Kernel threads always run */
+	if (unlikely(!p->mm))
+		return true;
+	if (rt_task(p))
+		return true;
+	if (!idleprio_suitable(p))
+		return true;
+	best_bias = best_smt_bias(cpu);
+	/* The smt siblings are all idle or running IDLEPRIO */
+	if (best_bias < 1)
+		return true;
+	task_bias = task_prio_bias(p);
+	if (task_bias < 1)
+		return false;
+	if (task_bias >= best_bias)
+		return true;
+	/* Dither 25% cpu of normal tasks regardless of nice difference */
+	if (best_bias % 4 == 1)
+		return true;
+	/* Sorry, you lose */
+	return false;
+}
+#endif
+#endif
+
+static bool resched_best_idle(struct task_struct *p)
+{
+	cpumask_t tmpmask;
+	int best_cpu;
+
+	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+	best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
+#ifdef CONFIG_SMT_NICE
+	if (!smt_should_schedule(p, best_cpu))
+		return false;
+#endif
+	resched_curr(cpu_rq(best_cpu));
+	return true;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+	if (suitable_idle_cpus(p))
+		resched_best_idle(p);
+}
+/*
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
+ */
+void cpu_scaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = true;
+}
+
+void cpu_nonscaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = false;
+}
+
+static inline bool scaling_rq(struct rq *rq)
+{
+	return rq->scaling;
+}
+
+static inline int locality_diff(int cpu, struct rq *rq)
+{
+	return rq->cpu_locality[cpu];
+}
+#else /* CONFIG_SMP */
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+	return uprq->curr == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns false.
+ */
+static inline bool scaling_rq(struct rq *rq)
+{
+	return false;
+}
+
+static inline int locality_diff(int cpu, struct rq *rq)
+{
+	return 0;
+}
+#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
+
+static inline int normal_prio(struct task_struct *p)
+{
+	if (has_rt_policy(p))
+		return MAX_RT_PRIO - 1 - p->rt_priority;
+	if (idleprio_task(p))
+		return IDLE_PRIO;
+	if (iso_task(p))
+		return ISO_PRIO;
+	return NORMAL_PRIO;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks as it will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with grq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	update_clocks(rq);
+
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (rq->clock_task - p->last_ran) >> 20);
+	}
+
+	p->prio = effective_prio(p);
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible--;
+	enqueue_task(p, rq);
+	rq->soft_affined++;
+	p->on_rq = 1;
+	grq.nr_running++;
+	inc_qnr();
+}
+
+static inline void clear_sticky(struct task_struct *p);
+
+/*
+ * deactivate_task - If it's running, it's not on the grq and we can just
+ * decrement the nr_running. Enter with grq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible++;
+	rq->soft_affined--;
+	p->on_rq = 0;
+	grq.nr_running--;
+	clear_sticky(p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold grq lock.
+	 */
+	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+#endif
+	if (task_cpu(p) == cpu)
+		return;
+	trace_sched_migrate_task(p, cpu);
+	perf_event_task_migrate(p);
+
+	/*
+	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+	if (p->on_rq) {
+		task_rq(p)->soft_affined--;
+		cpu_rq(cpu)->soft_affined++;
+	}
+	task_thread_info(p)->cpu = cpu;
+}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+	p->sticky = false;
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
+{
+	cpumask_t tmpmask;
+
+	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+	cpumask_clear_cpu(cpu, &tmpmask);
+	if (cpumask_empty(&tmpmask))
+		return;
+	resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+	if (rq->sticky_task) {
+		if (rq->sticky_task == p) {
+			p->sticky = true;
+			return;
+		}
+		if (task_sticky(rq->sticky_task)) {
+			clear_sticky(rq->sticky_task);
+			resched_closest_idle(rq, cpu, rq->sticky_task);
+		}
+	}
+	if (!rt_task(p)) {
+		p->sticky = true;
+		rq->sticky_task = p;
+	} else {
+		resched_closest_idle(rq, cpu, p);
+		rq->sticky_task = NULL;
+	}
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+	rq->sticky_task = NULL;
+	clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return false;
+}
+
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+/*
+ * Move a task off the global queue and take it to a cpu where it will
+ * become the running task.
+ */
+static inline void take_task(int cpu, struct task_struct *p)
+{
+	set_task_cpu(p, cpu);
+	dequeue_task(p);
+	clear_sticky(p);
+	dec_qnr();
+}
+
+/*
+ * Returns a descheduling task to the grq runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate)
+{
+	if (deactivate)
+		deactivate_task(p, rq);
+	else {
+		inc_qnr();
+		enqueue_task(p, rq);
+	}
+}
+
+/* Enter with grq lock held. We know p is on the local cpu */
+static inline void __set_tsk_resched(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	set_preempt_need_resched();
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+	int cpu;
+
+	lockdep_assert_held(&grq.lock);
+
+	if (test_tsk_need_resched(p))
+		return;
+
+	set_tsk_need_resched(p);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id()) {
+		set_preempt_need_resched();
+		return;
+	}
+
+	smp_send_reschedule(cpu);
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+struct migration_req {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+	unsigned long flags;
+	bool running, on_rq;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		rq = task_rq(p);
+
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since this will return false
+		 * if the runqueue has changed and p is actually now
+		 * running somewhere else!
+		 */
+		while (task_running(p) && p == rq->curr) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the grq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_grq_lock(p, &flags);
+		trace_sched_wait_task(p);
+		running = task_running(p);
+		on_rq = p->on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+		task_grq_unlock(&flags);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(on_rq)) {
+			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+/*
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio, u64 deadline)
+{
+	/* Better static priority RT task or better policy preemption */
+	if (p->prio < prio)
+		return true;
+	if (p->prio > prio)
+		return false;
+	/* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
+	if (!deadline_before(p->deadline, deadline))
+		return false;
+	return true;
+}
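+
+/*
+ * Worked example (illustrative only): a waking SCHED_FIFO task at prio 10
+ * preempts a runqueue whose current prio is 12, never preempts one at prio 5,
+ * and when both sides are SCHED_NORMAL at the same prio the waker only wins
+ * if its deadline is strictly earlier.
+ */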
+
+#ifdef CONFIG_SMP
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Check to see if there is a task that is affined only to offline CPUs but
+ * still wants runtime. This happens to kernel threads during suspend/halt and
+ * disabling of CPUs.
+ */
+static inline bool online_cpus(struct task_struct *p)
+{
+	return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed)));
+}
+#else /* CONFIG_HOTPLUG_CPU */
+/* All available CPUs are always online without hotplug. */
+static inline bool online_cpus(struct task_struct *p)
+{
+	return true;
+}
+#endif
+
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
+		return true;
+	return false;
+}
+
+/*
+ * When all else is equal, still prefer this_rq.
+ */
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	struct rq *highest_prio_rq = NULL;
+	int cpu, highest_prio;
+	u64 latest_deadline;
+	cpumask_t tmp;
+
+	/*
+	 * We clear the sticky flag here because for a task to have called
+	 * try_preempt with the sticky flag enabled means some complicated
+	 * re-scheduling has occurred and we should ignore the sticky flag.
+	 */
+	clear_sticky(p);
+
+	if (suitable_idle_cpus(p) && resched_best_idle(p))
+		return;
+
+	/* IDLEPRIO tasks never preempt anything but idle */
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+
+	if (likely(online_cpus(p)))
+		cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
+	else
+		return;
+
+	highest_prio = latest_deadline = 0;
+
+	for_each_cpu(cpu, &tmp) {
+		struct rq *rq;
+		int rq_prio;
+
+		rq = cpu_rq(cpu);
+		rq_prio = rq->rq_prio;
+		if (rq_prio < highest_prio)
+			continue;
+
+		if (rq_prio > highest_prio ||
+		    deadline_after(rq->rq_deadline, latest_deadline)) {
+			latest_deadline = rq->rq_deadline;
+			highest_prio = rq_prio;
+			highest_prio_rq = rq;
+		}
+	}
+
+	if (likely(highest_prio_rq)) {
+#ifdef CONFIG_SMT_NICE
+		cpu = cpu_of(highest_prio_rq);
+		if (!smt_should_schedule(p, cpu))
+			return;
+#endif
+		if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
+			resched_curr(highest_prio_rq);
+	}
+}
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check);
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
+		resched_curr(uprq);
+}
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu)
+		schedstat_inc(rq, ttwu_local);
+	else {
+		struct sched_domain *sd;
+
+		rcu_read_lock();
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+void wake_up_if_idle(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
+
+	grq_lock_irqsave(&flags);
+	if (likely(is_idle_task(rq->curr)))
+		smp_send_reschedule(cpu);
+	/* Else cpu is not in idle, do nothing here */
+	grq_unlock_irqrestore(&flags);
+
+out:
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_SMP
+void scheduler_ipi(void)
+{
+	/*
+	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+	 * TIF_NEED_RESCHED remotely (for the first time) will also send
+	 * this IPI.
+	 */
+	preempt_fold_need_resched();
+}
+#endif
+
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+				 bool is_sync)
+{
+	activate_task(p, rq);
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption if there are no idle cpus,
+	 * instead waiting for current to deschedule.
+	 */
+	if (!is_sync || suitable_idle_cpus(p))
+		try_preempt(p, rq);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+					bool success)
+{
+	trace_sched_wakeup(p);
+	p->state = TASK_RUNNING;
+
+	/*
+	 * if a worker is waking up, notify workqueue. Note that on BFS, we
+	 * don't really know what cpu it will be, so we fake it for
+	 * wq_worker_waking_up :/
+	 */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * wake flags
+ */
+#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
+#define WF_FORK		0x02		/* child wakeup after fork */
+#define WF_MIGRATED	0x4		/* internal use, task got migrated */
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static bool try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
+{
+	bool success = false;
+	unsigned long flags;
+	struct rq *rq;
+	int cpu;
+
+	get_cpu();
+
+	/*
+	 * If we are going to wake up a thread waiting for CONDITION we
+	 * need to ensure that CONDITION=1 done by the caller can not be
+	 * reordered with p->state check below. This pairs with mb() in
+	 * set_current_state() the waiting thread does.
+	 */
+	smp_mb__before_spinlock();
+
+	/*
+	 * No need to do time_lock_grq as we only need to update the rq clock
+	 * if we activate the task
+	 */
+	rq = task_grq_lock(p, &flags);
+	cpu = task_cpu(p);
+
+	/* state is a volatile long, why that is, I don't know */
+	if (!((unsigned int)p->state & state))
+		goto out_unlock;
+
+	trace_sched_waking(p);
+
+	if (task_queued(p) || task_running(p))
+		goto out_running;
+
+	ttwu_activate(p, rq, wake_flags & WF_SYNC);
+	success = true;
+
+out_running:
+	ttwu_post_activation(p, rq, success);
+out_unlock:
+	task_grq_unlock(&flags);
+
+	ttwu_stat(p, cpu, wake_flags);
+
+	put_cpu();
+
+	return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with grq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that grq is locked and @p is not the current task.
+ * grq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	lockdep_assert_held(&grq.lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	trace_sched_waking(p);
+
+	if (!task_queued(p)) {
+		if (likely(!task_running(p))) {
+			schedstat_inc(rq, ttwu_count);
+			schedstat_inc(rq, ttwu_local);
+		}
+		ttwu_activate(p, rq, false);
+		ttwu_stat(p, smp_processor_id(), 0);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, success);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+static void time_slice_expired(struct task_struct *p);
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+	/*
+	 * The process state is set to the same value of the process executing
+	 * do_fork() code. That is running. This guarantees that nobody will
+	 * actually run it, and a signal or other external event cannot wake
+	 * it up and insert it on the runqueue either.
+	 */
+
+	/* Should be reset in fork.c but done here for ease of bfs patching */
+	p->on_rq =
+	p->utime =
+	p->stime =
+	p->utimescaled =
+	p->stimescaled =
+	p->sched_time =
+	p->stime_pc =
+	p->utime_pc = 0;
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+			p->policy = SCHED_NORMAL;
+			p->normal_prio = normal_prio(p);
+		}
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+			p->normal_prio = p->static_prio;
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
+	INIT_LIST_HEAD(&p->run_list);
+#ifdef CONFIG_SCHED_INFO
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+	p->on_cpu = false;
+	clear_sticky(p);
+	init_task_preempt_count(p);
+	return 0;
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+	struct task_struct *parent;
+	unsigned long flags;
+	struct rq *rq;
+
+	parent = p->parent;
+	rq = task_grq_lock(p, &flags);
+
+	/*
+	 * Reinit new task deadline as its creator deadline could have changed
+	 * since call to dup_task_struct().
+	 */
+	p->deadline = rq->rq_deadline;
+
+	/*
+	 * If the task is a new process, current and parent are the same. If
+	 * the task is a new thread in the thread group, it will have much more
+	 * in common with current than with the parent.
+	 */
+	set_task_cpu(p, task_cpu(rq->curr));
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = rq->curr->normal_prio;
+
+	activate_task(p, rq);
+	trace_sched_wakeup_new(p);
+	if (unlikely(p->policy == SCHED_FIFO))
+		goto after_ts_init;
+
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness. If it's negative, it won't
+	 * matter since that's the same as being 0. current's time_slice is
+	 * actually in rq_time_slice when it's running, as is its last_ran
+	 * value. rq->rq_deadline is only modified within schedule() so it
+	 * is always equal to current->deadline.
+	 */
+	p->last_ran = rq->rq_last_ran;
+	if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+		rq->rq_time_slice /= 2;
+		p->time_slice = rq->rq_time_slice;
+after_ts_init:
+		if (rq->curr == parent && !suitable_idle_cpus(p)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			__set_tsk_resched(parent);
+		} else
+			try_preempt(p, rq);
+	} else {
+		if (rq->curr == parent) {
+			/*
+			 * Forking task has run out of timeslice. Reschedule it and
+			 * start its child with a new time slice and deadline. The
+			 * child will end up running first because its deadline will
+			 * be slightly earlier.
+			 */
+			rq->rq_time_slice = 0;
+			__set_tsk_resched(parent);
+		}
+		time_slice_expired(p);
+	}
+	task_grq_unlock(&flags);
+}
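+
+/*
+ * Worked example of the timeslice split above, assuming the default
+ * rr_interval of 6ms (rr_interval is a tunable, so the numbers are purely
+ * illustrative): a parent with 6000us of rq_time_slice left has it halved
+ * to 3000us and the child starts with the same 3000us, so the total
+ * pending timeslice in the system is unchanged. Only when less than
+ * RESCHED_US * 2 remains does the child instead get a full new slice via
+ * time_slice_expired(), with the forking parent rescheduled if it is
+ * still the one running.
+ */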
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
+	hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is *not* safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+	hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	struct preempt_notifier *notifier;
+
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_in_preempt_notifiers(curr);
+}
+
+static void
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	struct preempt_notifier *notifier;
+
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+		notifier->ops->sched_out(notifier, next);
+}
+
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @prev: the task we are switching away from
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+		    struct task_struct *next)
+{
+	sched_info_switch(rq, prev, next);
+	perf_event_task_sched_out(prev, next);
+	fire_sched_out_preempt_notifiers(prev, next);
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static struct rq *finish_task_switch(struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct rq *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	long prev_state;
+
+	/*
+	 * The previous task will have left us with a preempt_count of 2
+	 * because it left us after:
+	 *
+	 *	schedule()
+	 *	  preempt_disable();			// 1
+	 *	  __schedule()
+	 *	    raw_spin_lock_irq(&rq->lock)	// 2
+	 *
+	 * Also, see FORK_PREEMPT_COUNT.
+	 */
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could race with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
+	 */
+	prev_state = prev->state;
+	vtime_task_switch(prev);
+	perf_event_task_sched_in(prev, current);
+	finish_lock_switch(rq, prev);
+	finish_arch_post_lock_switch();
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+	return rq;
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct rq *rq;
+
+	/*
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
+
+	rq = finish_task_switch(prev);
+	preempt_enable();
+
+	if (current->set_child_tid)
+		put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	struct mm_struct *mm, *oldmm;
+
+	prepare_task_switch(rq, prev, next);
+
+	mm = next->mm;
+	oldmm = prev->active_mm;
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (!mm) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (!prev->mm) {
+		prev->active_mm = NULL;
+		rq->prev_mm = oldmm;
+	}
+	/*
+	 * The runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+	barrier();
+
+	return finish_task_switch(prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible threads, and total number of
+ * context switches performed since bootup. All are
+ * measured without grabbing the grq lock but the occasional inaccurate result
+ * doesn't matter so long as it's positive.
+ */
+unsigned long nr_running(void)
+{
+	long nr = grq.nr_running;
+
+	if (unlikely(nr < 0))
+		nr = 0;
+	return (unsigned long)nr;
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+	long nu = grq.nr_uninterruptible;
+
+	if (unlikely(nu < 0))
+		nu = 0;
+	return nu;
+}
+
+/*
+ * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race.  The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
+ */
+bool single_task_running(void)
+{
+	return cpu_rq(smp_processor_id())->soft_affined == 1;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+	long long ns = grq.nr_switches;
+
+	/* This is of course impossible */
+	if (unlikely(ns < 0))
+		ns = 1;
+	return (unsigned long long)ns;
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+	struct rq *this = cpu_rq(cpu);
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+	return nr_running() + nr_uninterruptible();
+}
+
+/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we
+ * base it on the number of running or queued tasks whose ->rq pointer is set
+ * to this cpu, i.e. the CPU they're most likely to run on. */
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+	struct rq *rq = this_rq();
+
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->soft_affined;
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
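+
+/*
+ * Usage sketch, only as an illustration of the offset/shift arguments
+ * (this mirrors what the /proc/loadavg code is assumed to do, it is not
+ * shown in this hunk):
+ *
+ *	unsigned long avnrun[3];
+ *
+ *	get_avenrun(avnrun, FIXED_1/200, 0);
+ *
+ * The FIXED_1/200 offset rounds the fixed-point value to the nearest
+ * hundredth before the integer and two-decimal fractional parts are
+ * extracted with shifts and masks for printing.
+ */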
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
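+
+/*
+ * Worked example of the fixed-point decay above, assuming the standard
+ * loadavg constants FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884 for the
+ * 1-minute average (defined elsewhere, so treat the numbers as
+ * illustrative): starting from load = 0 with a constant active = FIXED_1,
+ *
+ *	load' = (0 * 1884 + 2048 * (2048 - 1884)) >> 11 = 164
+ *
+ * so each 5-second LOAD_FREQ interval moves the 1-minute average about
+ * 8% (164/2048) of the way towards the instantaneous value.
+ */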
+
+/*
+ * calc_global_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(unsigned long ticks)
+{
+	long active;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+	active = nr_active() * FIXED_1;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on the corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time and racing
+ * with irq/account_system_vtime on this CPU. We would either get the old
+ * or the new value with a side effect of accounting a slice of irq time to
+ * the wrong task when an irq is in progress while we read rq->clock. That is
+ * a worthy compromise in place of having locks on each irq in
+ * account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+	unsigned long flags;
+	s64 delta;
+	int cpu;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+	__this_cpu_add(irq_start_time, delta);
+
+	irq_time_write_begin();
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to the ksoftirqd thread
+	 * in that case, so as not to confuse the scheduler with a special task
+	 * that does not consume any time but still wants to run.
+	 */
+	if (hardirq_count())
+		__this_cpu_add(cpu_hardirq_time, delta);
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		__this_cpu_add(cpu_softirq_time, delta);
+
+	irq_time_write_end();
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
+
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight misattribution of {soft,}irq
+	 * time; a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_key_false((&paravirt_steal_rq_enabled))) {
+		s64 steal = paravirt_steal_clock(cpu_of(rq));
+
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
+	rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void irqtime_account_hi_si(void)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	u64 latest_ns;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
+	if (latest_ns > cpustat[CPUTIME_IRQ])
+		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
+	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
+		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime	(0)
+
+static inline void irqtime_account_hi_si(void)
+{
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (static_key_false(&paravirt_steal_enabled)) {
+		u64 steal;
+		cputime_t steal_ct;
+
+		steal = paravirt_steal_clock(smp_processor_id());
+		steal -= this_rq()->prev_steal_time;
+
+		/*
+		 * cputime_t may be less precise than nsecs (e.g. if it's
+		 * based on jiffies). Let's cast the result to cputime
+		 * granularity and account the rest on the next rounds.
+		 */
+		steal_ct = nsecs_to_cputime(steal);
+		this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+
+		account_steal_time(steal_ct);
+		return steal_ct;
+	}
+#endif
+	return false;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct signal_struct *sig = tsk->signal;
+	cputime_t utime, stime;
+	struct task_struct *t;
+	unsigned int seq, nextseq;
+	unsigned long flags;
+
+	rcu_read_lock();
+	/* Attempt a lockless read on the first round. */
+	nextseq = 0;
+	do {
+		seq = nextseq;
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+		times->utime = sig->utime;
+		times->stime = sig->stime;
+		times->sum_exec_runtime = sig->sum_sched_runtime;
+
+		for_each_thread(tsk, t) {
+			task_cputime(t, &utime, &stime);
+			times->utime += utime;
+			times->stime += stime;
+			times->sum_exec_runtime += task_sched_runtime(t);
+		}
+		/* If lockless access failed, take the lock. */
+		nextseq = 1;
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+	rcu_read_unlock();
+}
+
+/*
+ * On each tick, see what percentage of that tick was attributed to each
+ * component and add the percentage to the _pc values. Once a _pc value has
+ * accumulated one tick's worth, account for that. This means the total
+ * percentage of load components will always be 128 (pseudo 100) per tick.
+ */
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	if (atomic_read(&rq->nr_iowait) > 0) {
+		rq->iowait_pc += pc;
+		if (rq->iowait_pc >= 128) {
+			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
+			rq->iowait_pc %= 128;
+		}
+	} else {
+		rq->idle_pc += pc;
+		if (rq->idle_pc >= 128) {
+			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
+			rq->idle_pc %= 128;
+		}
+	}
+	acct_update_integrals(idle);
+}
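+
+/*
+ * For example, a task observed for half a tick at each accounting point
+ * adds 64 to the relevant _pc counter each time; after two such rounds
+ * the counter reaches 128, one full jiffy of cputime is banked, and the
+ * remainder carries over via the modulo, so partial-tick time is neither
+ * lost nor double counted.
+ */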
+
+static void
+pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
+	       unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->stime_pc += pc;
+	if (p->stime_pc >= 128) {
+		int jiffs = p->stime_pc / 128;
+
+		p->stime_pc %= 128;
+		p->stime += (__force u64)cputime_one_jiffy * jiffs;
+		p->stimescaled += one_jiffy_scaled * jiffs;
+		account_group_system_time(p, cputime_one_jiffy * jiffs);
+	}
+	p->sched_time += ns;
+	account_group_exec_runtime(p, ns);
+
+	if (hardirq_count() - hardirq_offset) {
+		rq->irq_pc += pc;
+		if (rq->irq_pc >= 128) {
+			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
+			rq->irq_pc %= 128;
+		}
+	} else if (in_serving_softirq()) {
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
+			rq->softirq_pc %= 128;
+		}
+	} else {
+		rq->system_pc += pc;
+		if (rq->system_pc >= 128) {
+			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
+			rq->system_pc %= 128;
+		}
+	}
+	acct_update_integrals(p);
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p,
+			 unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->utime_pc += pc;
+	if (p->utime_pc >= 128) {
+		int jiffs = p->utime_pc / 128;
+
+		p->utime_pc %= 128;
+		p->utime += (__force u64)cputime_one_jiffy * jiffs;
+		p->utimescaled += one_jiffy_scaled * jiffs;
+		account_group_user_time(p, cputime_one_jiffy * jiffs);
+	}
+	p->sched_time += ns;
+	account_group_exec_runtime(p, ns);
+
+	if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time does not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 */
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
+			rq->softirq_pc %= 128;
+		}
+	}
+
+	if (task_nice(p) > 0 || idleprio_task(p)) {
+		rq->nice_pc += pc;
+		if (rq->nice_pc >= 128) {
+			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
+			rq->nice_pc %= 128;
+		}
+	} else {
+		rq->user_pc += pc;
+		if (rq->user_pc >= 128) {
+			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
+			rq->user_pc %= 128;
+		}
+	}
+	acct_update_integrals(p);
+}
+
+/*
+ * Convert nanoseconds to a pseudo percentage of one tick. Use 128 instead
+ * of 100 for fast shifts.
+ */
+#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
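+
+/*
+ * e.g. with HZ=1000, JIFFY_NS is 1,000,000 and 250,000ns of runtime maps
+ * to NS_TO_PC(250000) = 32, i.e. a quarter of the 128 pseudo-percent that
+ * make up one tick. JIFFY_NS depends on HZ, so the absolute numbers are
+ * only illustrative.
+ */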
+
+/*
+ * This is called on clock ticks.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+{
+	long account_ns = rq->clock_task - rq->rq_last_ran;
+	struct task_struct *idle = rq->idle;
+	unsigned long account_pc;
+
+	if (unlikely(account_ns < 0) || steal_account_process_tick())
+		goto ts_account;
+
+	account_pc = NS_TO_PC(account_ns);
+
+	/* Accurate tick timekeeping */
+	if (user_mode(get_irq_regs()))
+		pc_user_time(rq, p, account_pc, account_ns);
+	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
+		pc_system_time(rq, p, HARDIRQ_OFFSET,
+			       account_pc, account_ns);
+	else
+		pc_idle_time(rq, idle, account_pc);
+
+	if (sched_clock_irqtime)
+		irqtime_account_hi_si();
+
+ts_account:
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (rq->rq_policy != SCHED_FIFO && p != idle) {
+		s64 time_diff = rq->clock - rq->timekeep_clock;
+
+		niffy_diff(&time_diff, 1);
+		rq->rq_time_slice -= NS_TO_US(time_diff);
+	}
+
+	rq->rq_last_ran = rq->clock_task;
+	rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * This is called on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+{
+	long account_ns = rq->clock_task - rq->rq_last_ran;
+	struct task_struct *idle = rq->idle;
+	unsigned long account_pc;
+
+	if (unlikely(account_ns < 0))
+		goto ts_account;
+
+	account_pc = NS_TO_PC(account_ns);
+
+	/* Accurate subtick timekeeping */
+	if (p != idle)
+		pc_user_time(rq, p, account_pc, account_ns);
+	else
+		pc_idle_time(rq, idle, account_pc);
+
+ts_account:
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (rq->rq_policy != SCHED_FIFO && p != idle) {
+		s64 time_diff = rq->clock - rq->timekeep_clock;
+
+		niffy_diff(&time_diff, 1);
+		rq->rq_time_slice -= NS_TO_US(time_diff);
+	}
+
+	rq->rq_last_ran = rq->clock_task;
+	rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_grq_lock() held.
+ */
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	/*
+	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+	 * project cycles that may never be accounted to this
+	 * thread, breaking clock_gettime().
+	 */
+	if (p == rq->curr && p->on_rq) {
+		update_clocks(rq);
+		ns = rq->clock_task - rq->rq_last_ran;
+		if (unlikely((s64)ns < 0))
+			ns = 0;
+	}
+
+	return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately the current's pending runtime that has not been
+ * accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+	/*
+	 * 64-bit doesn't need locks to atomically read a 64-bit value.
+	 * So we have an optimization chance when the task's delta_exec is 0.
+	 * Reading ->on_cpu is racy, but this is ok.
+	 *
+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
+	 * If we race with it entering cpu, unaccounted time is 0. This is
+	 * indistinguishable from the read occurring a few cycles earlier.
+	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+	 * been accounted, so we're correct here as well.
+	 */
+	if (!p->on_cpu || !p->on_rq)
+		return tsk_seruntime(p);
+#endif
+
+	rq = task_grq_lock(p, &flags);
+	ns = p->sched_time + do_task_delta_exec(p, rq);
+	task_grq_unlock(&flags);
+
+	return ns;
+}
+
+/* Compatibility crap */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+		       cputime_t cputime_scaled)
+{
+}
+
+void account_idle_time(cputime_t cputime)
+{
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+			       cputime_t cputime_scaled)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	/* Add guest time to process. */
+	p->utime += (__force u64)cputime;
+	p->utimescaled += (__force u64)cputime_scaled;
+	account_group_user_time(p, cputime);
+	p->gtime += (__force u64)cputime;
+
+	/* Add guest time to cpustat. */
+	if (task_nice(p) > 0) {
+		cpustat[CPUTIME_NICE] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
+	} else {
+		cpustat[CPUTIME_USER] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
+	}
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	/* Add system time to process. */
+	p->stime += (__force u64)cputime;
+	p->stimescaled += (__force u64)cputime_scaled;
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 += (__force u64)cputime;
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * This is for guest only now.
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime, cputime_t cputime_scaled)
+{
+
+	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+		account_guest_time(p, cputime, cputime_scaled);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+static void account_idle_times(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	struct rq *rq = this_rq();
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
+	else
+		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+	account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of idle ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+	account_idle_times(jiffies_to_cputime(ticks));
+}
+#endif
+
+static inline void grq_iso_lock(void)
+	__acquires(grq.iso_lock)
+{
+	raw_spin_lock(&grq.iso_lock);
+}
+
+static inline void grq_iso_unlock(void)
+	__releases(grq.iso_lock)
+{
+	raw_spin_unlock(&grq.iso_lock);
+}
+
+/*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL.
+ * Where possible, the data is tested lockless, to avoid grabbing iso_lock
+ * because the occasional inaccurate result won't matter. However the
+ * tick data is only ever modified under lock. iso_refractory is simply
+ * set to 0 or 1, so it's not worth grabbing the lock yet again for that.
+ */
+static bool set_iso_refractory(void)
+{
+	grq.iso_refractory = true;
+	return grq.iso_refractory;
+}
+
+static bool clear_iso_refractory(void)
+{
+	grq.iso_refractory = false;
+	return grq.iso_refractory;
+}
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
+ * slow division.
+ */
+static bool test_ret_isorefractory(struct rq *rq)
+{
+	if (likely(!grq.iso_refractory)) {
+		if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
+			return set_iso_refractory();
+	} else {
+		if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
+			return clear_iso_refractory();
+	}
+	return grq.iso_refractory;
+}
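+
+/*
+ * Worked example, assuming the default sched_iso_cpu of 70 (a tunable,
+ * so only illustrative): iso_tick() below adds 100 per qualifying tick,
+ * so the refractory flag is set once grq.iso_ticks exceeds
+ * ISO_PERIOD * 70 and is only cleared again once it decays below
+ * ISO_PERIOD * (70 * 115 / 128), roughly 90% of the trigger level, which
+ * is the ~10% hysteresis described above.
+ */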
+
+static void iso_tick(void)
+{
+	grq_iso_lock();
+	grq.iso_ticks += 100;
+	grq_iso_unlock();
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(void)
+{
+	if (grq.iso_ticks) {
+		grq_iso_lock();
+		grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
+		if (unlikely(grq.iso_refractory && grq.iso_ticks <
+		    ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
+			clear_iso_refractory();
+		grq_iso_unlock();
+	}
+}
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq)
+{
+	struct task_struct *p;
+
+	/*
+	 * If a SCHED_ISO task is running we increment the iso_ticks. In
+	 * order to prevent SCHED_ISO tasks from causing starvation in the
+	 * presence of true RT tasks we account those as iso_ticks as well.
+	 */
+	if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
+		if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
+			iso_tick();
+	} else
+		no_iso_tick();
+
+	if (iso_queue(rq)) {
+		if (unlikely(test_ret_isorefractory(rq))) {
+			if (rq_running_iso(rq)) {
+				/*
+				 * SCHED_ISO task is running as RT and limit
+				 * has been hit. Force it to reschedule as
+				 * SCHED_NORMAL by zeroing its time_slice
+				 */
+				rq->rq_time_slice = 0;
+			}
+		}
+	}
+
+	/* SCHED_FIFO tasks never run out of timeslice. */
+	if (rq->rq_policy == SCHED_FIFO)
+		return;
+	/*
+	 * Tasks that were scheduled in the first half of a tick are not
+	 * allowed to run into the 2nd half of the next tick if they will
+	 * run out of time slice in the interim. Otherwise, if they have
+	 * less than RESCHED_US μs of time slice left they will be rescheduled.
+	 */
+	if (rq->dither) {
+		if (rq->rq_time_slice > HALF_JIFFY_US)
+			return;
+		else
+			rq->rq_time_slice = 0;
+	} else if (rq->rq_time_slice >= RESCHED_US)
+		return;
+
+	/* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
+	p = rq->curr;
+
+	grq_lock();
+	requeue_task(p);
+	__set_tsk_resched(p);
+	grq_unlock();
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled. The data modified is all
+ * local to struct rq so we don't need to grab grq lock.
+ */
+void scheduler_tick(void)
+{
+	int cpu __maybe_unused = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	sched_clock_tick();
+	/* grq lock not grabbed, so only update rq clock */
+	update_rq_clock(rq);
+	update_cpu_clock_tick(rq, rq->curr);
+	if (!rq_idle(rq))
+		task_running_tick(rq);
+	else
+		no_iso_tick();
+	rq->last_tick = rq->clock;
+	perf_event_task_tick();
+}
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+		return;
+#endif
+	__preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val) {
+		unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+		current->preempt_disable_ip = ip;
+#endif
+		trace_preempt_off(CALLER_ADDR0, ip);
+	}
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+		return;
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+			!(preempt_count() & PREEMPT_MASK)))
+		return;
+#endif
+
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+	__preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+#endif
+
+/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation gives nice 19 tasks about 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+	return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
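+
+/*
+ * Worked example, assuming prio_ratios[] grows by roughly 10% per nice
+ * level from its base value (its initialisation is not shown in this
+ * hunk): the offset for nice 19 is then about 1.1^19 ~= 6 times that of
+ * nice 0, and since CPU share works out to the square of the relative
+ * deadline offsets, a nice 19 task ends up with roughly 1/37th, i.e.
+ * about 3%, of the CPU that a competing nice 0 task gets.
+ */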
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+	return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+	return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+	return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+	return NS_TO_MS(longest_deadline_diff());
+}
+
+/*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline.
+ */
+static void time_slice_expired(struct task_struct *p)
+{
+	p->time_slice = timeslice();
+	p->deadline = grq.niffies + task_deadline_diff(p);
+#ifdef CONFIG_SMT_NICE
+	if (!p->mm)
+		p->smt_bias = 0;
+	else if (rt_task(p))
+		p->smt_bias = 1 << 30;
+	else if (task_running_iso(p))
+		p->smt_bias = 1 << 29;
+	else if (idleprio_task(p)) {
+		if (task_running_idle(p))
+			p->smt_bias = 0;
+		else
+			p->smt_bias = 1;
+	} else if (--p->smt_bias < 1)
+		p->smt_bias = MAX_PRIO - p->static_prio;
+#endif
+}
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged as not latency sensitive and likely to be fully CPU
+ * bound, so every time they're rescheduled they have their time_slice
+ * refilled, but get a new, later deadline so they have little effect on
+ * SCHED_NORMAL tasks.
+ */
+static inline void check_deadline(struct task_struct *p)
+{
+	if (p->time_slice < RESCHED_US || batch_task(p))
+		time_slice_expired(p);
+}
+
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+
+/*
+ * Scheduler queue bitmap specific find next bit.
+ */
+static inline unsigned long
+next_sched_bit(const unsigned long *addr, unsigned long offset)
+{
+	const unsigned long *p;
+	unsigned long result;
+	unsigned long size;
+	unsigned long tmp;
+
+	size = PRIO_LIMIT;
+	if (offset >= size)
+		return size;
+
+	p = addr + BITOP_WORD(offset);
+	result = offset & ~(BITS_PER_LONG-1);
+	size -= result;
+	offset %= BITS_PER_LONG;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < BITS_PER_LONG)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= BITS_PER_LONG;
+		result += BITS_PER_LONG;
+	}
+	while (size & ~(BITS_PER_LONG-1)) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += BITS_PER_LONG;
+		size -= BITS_PER_LONG;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (BITS_PER_LONG - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
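+
+/*
+ * e.g. next_sched_bit(grq.prio_bitmap, 0) returns the index of the lowest
+ * set priority bit (the best priority with anything queued) and returns
+ * PRIO_LIMIT when no bit at or above @offset is set, which is how the
+ * lookup loop below detects that nothing runnable is queued.
+ */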
+
+/*
+ * O(n) lookup of all tasks in the global runqueue. The real brainfuck
+ * of lock contention and O(n). It's not really O(n) over all tasks, as only
+ * the queued but not running tasks are scanned, and it is O(n) of the queued
+ * tasks in the worst case only, since the right task can often be found
+ * before scanning all of them.
+ * Tasks are selected in this order:
+ * Real time tasks are selected purely by their static priority and in the
+ * order they were queued, so the lowest value idx, and the first queued task
+ * of that priority value is chosen.
+ * If no real time tasks are found, the SCHED_ISO priority is checked, and
+ * all SCHED_ISO tasks have the same priority value, so they're selected by
+ * the earliest deadline value.
+ * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
+ * earliest deadline.
+ * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
+ * selected by the earliest deadline.
+ */
+static inline struct
+task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct task_struct *edt = NULL;
+	unsigned long idx = -1;
+
+	do {
+		struct list_head *queue;
+		struct task_struct *p;
+		u64 earliest_deadline;
+
+		idx = next_sched_bit(grq.prio_bitmap, ++idx);
+		if (idx >= PRIO_LIMIT)
+			return idle;
+		queue = grq.queue + idx;
+
+		if (idx < MAX_RT_PRIO) {
+			/* We found an rt task */
+			list_for_each_entry(p, queue, run_list) {
+				/* Make sure cpu affinity is ok */
+				if (needs_other_cpu(p, cpu))
+					continue;
+				edt = p;
+				goto out_take;
+			}
+			/*
+			 * None of the RT tasks at this priority can run on
+			 * this cpu
+			 */
+			continue;
+		}
+
+		/*
+		 * No rt tasks. Find the earliest deadline task. Now we're in
+		 * O(n) territory.
+		 */
+		earliest_deadline = ~0ULL;
+		list_for_each_entry(p, queue, run_list) {
+			u64 dl;
+
+			/* Make sure cpu affinity is ok */
+			if (needs_other_cpu(p, cpu))
+				continue;
+
+#ifdef CONFIG_SMT_NICE
+			if (!smt_should_schedule(p, cpu))
+				continue;
+#endif
+			/*
+			 * Soft affinity happens here by not scheduling a task
+			 * with its sticky flag set that ran on a different CPU
+			 * last when the CPU is scaling, or by greatly biasing
+			 * against its deadline when not, based on cpu cache
+			 * locality.
+			 */
+			if (sched_interactive)
+				dl = p->deadline;
+			else {
+				int tcpu = task_cpu(p);
+
+				if (tcpu != cpu && task_sticky(p) && scaling_rq(rq))
+					continue;
+				dl = p->deadline << locality_diff(tcpu, rq);
+			}
+
+			if (deadline_before(dl, earliest_deadline)) {
+				earliest_deadline = dl;
+				edt = p;
+			}
+		}
+	} while (!edt);
+
+out_take:
+	take_task(cpu, edt);
+	return edt;
+}
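+
+/*
+ * Illustration of the deadline biasing above, assuming locality_diff()
+ * returns 0 for the local CPU and increasing values for progressively
+ * more remote CPUs (its definition is not in this hunk): in
+ * non-interactive mode a task last run on a CPU sharing cache competes
+ * with close to its real deadline, while one from a remote node has its
+ * deadline doubled or quadrupled by the shift and is only picked when no
+ * reasonably local work is queued.
+ */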
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+	if (oops_in_progress)
+		return;
+
+	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+		prev->comm, prev->pid, preempt_count());
+
+	debug_show_held_locks(prev);
+	print_modules();
+	if (irqs_disabled())
+		print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (in_atomic_preempt_off()) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
+	dump_stack();
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+	BUG_ON(task_stack_end_corrupted(prev));
+#endif
+
+	if (unlikely(in_atomic_preempt_off())) {
+		__schedule_bug(prev);
+		preempt_count_set(PREEMPT_DISABLED);
+	}
+	rcu_sleep_check();
+
+	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+	schedstat_inc(this_rq(), sched_count);
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU, thereby allowing the data to be
+ * changed without grabbing the grq lock.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_time_slice = p->time_slice;
+	rq->rq_deadline = p->deadline;
+	rq->rq_last_ran = p->last_ran = rq->clock_task;
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+	rq->rq_mm = p->mm;
+	rq->rq_smt_bias = p->smt_bias;
+#endif
+	rq->rq_running = (p != rq->idle);
+}
+
+static void reset_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+	rq->rq_smt_bias = p->smt_bias;
+#endif
+}
+
+#ifdef CONFIG_SMT_NICE
+/* Iterate over smt siblings when we've scheduled a process on cpu and decide
+ * whether they should continue running or be descheduled. */
+static void check_smt_siblings(int cpu)
+{
+	int other_cpu;
+
+	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+		struct task_struct *p;
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq))
+			continue;
+		if (!rq->online)
+			continue;
+		p = rq->curr;
+		if (!smt_should_schedule(p, cpu)) {
+			set_tsk_need_resched(p);
+			smp_send_reschedule(other_cpu);
+		}
+	}
+}
+
+static void wake_smt_siblings(int cpu)
+{
+	int other_cpu;
+
+	if (!queued_notrunning())
+		return;
+
+	for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+		struct rq *rq;
+
+		if (other_cpu == cpu)
+			continue;
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq)) {
+			struct task_struct *p = rq->curr;
+
+			set_tsk_need_resched(p);
+			smp_send_reschedule(other_cpu);
+		}
+	}
+}
+#else
+static void check_smt_siblings(int __maybe_unused cpu) {}
+static void wake_smt_siblings(int __maybe_unused cpu) {}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ *      paths. For example, see arch/x86/entry_64.S.
+ *
+ *      To drive preemption between tasks, the scheduler sets the flag in timer
+ *      interrupt handler scheduler_tick().
+ *
+ *   3. Wakeups don't really cause entry into schedule(). They add a
+ *      task to the run-queue and that's it.
+ *
+ *      Now, if the new task added to the run-queue preempts the current
+ *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *      called on the nearest possible occasion:
+ *
+ *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ *         - in syscall or exception context, at the next outmost
+ *           preempt_enable(). (this might be as soon as the wake_up()'s
+ *           spin_unlock()!)
+ *
+ *         - in IRQ context, return from interrupt-handler to
+ *           preemptible context
+ *
+ *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *         then at the next:
+ *
+ *          - cond_resched() call
+ *          - explicit schedule() call
+ *          - return from syscall or exception to user-space
+ *          - return from interrupt-handler to user-space
+ *
+ * WARNING: must be called with preemption disabled!
+ */
+static void __sched notrace __schedule(bool preempt)
+{
+	struct task_struct *prev, *next, *idle;
+	unsigned long *switch_count;
+	bool deactivate = false;
+	struct rq *rq;
+	int cpu;
+
+	cpu = smp_processor_id();
+	rq = cpu_rq(cpu);
+	prev = rq->curr;
+
+	/*
+	 * do_exit() calls schedule() with preemption disabled as an exception;
+	 * however we must fix that up, otherwise the next task will see an
+	 * inconsistent (higher) preempt count.
+	 *
+	 * It also avoids the below schedule_debug() test from complaining
+	 * about this.
+	 */
+	if (unlikely(prev->state == TASK_DEAD))
+		preempt_enable_no_resched_notrace();
+
+	schedule_debug(prev);
+
+	local_irq_disable();
+	rcu_note_context_switch();
+
+	/*
+	 * Make sure that signal_pending_state()->signal_pending() below
+	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+	 * done by the caller to avoid the race with signal_wake_up().
+	 */
+	smp_mb__before_spinlock();
+	grq_lock();
+
+	switch_count = &prev->nivcsw;
+	if (!preempt && prev->state) {
+		if (unlikely(signal_pending_state(prev->state, prev))) {
+			prev->state = TASK_RUNNING;
+		} else {
+			deactivate = true;
+			prev->on_rq = 0;
+
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (prev->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(prev, cpu);
+				if (to_wakeup) {
+					/* This shouldn't happen, but does */
+					if (unlikely(to_wakeup == prev))
+						deactivate = false;
+					else
+						try_to_wake_up_local(to_wakeup);
+				}
+			}
+		}
+		switch_count = &prev->nvcsw;
+	}
+
+	update_clocks(rq);
+	update_cpu_clock_switch(rq, prev);
+	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+		rq->dither = false;
+	else
+		rq->dither = true;
+
+	clear_tsk_need_resched(prev);
+	clear_preempt_need_resched();
+
+	idle = rq->idle;
+	if (idle != prev) {
+		/* Update all the information stored on struct rq */
+		prev->time_slice = rq->rq_time_slice;
+		prev->deadline = rq->rq_deadline;
+		check_deadline(prev);
+		prev->last_ran = rq->clock_task;
+
+		/* Unless the task has changed affinity off this CPU */
+		if (likely(!needs_other_cpu(prev, cpu))) {
+			if (!deactivate) {
+				if (!queued_notrunning()) {
+					/*
+					 * We now know prev is the only thing that is
+					 * awaiting CPU so we can bypass rechecking for
+					 * the earliest deadline task and just run it
+					 * again.
+					 */
+					set_rq_task(rq, prev);
+					check_smt_siblings(cpu);
+					grq_unlock_irq();
+					goto rerun_prev_unlocked;
+				} else
+					swap_sticky(rq, cpu, prev);
+			}
+		}
+		return_task(prev, rq, deactivate);
+	}
+
+	if (unlikely(!queued_notrunning())) {
+		/*
+		 * This CPU is now truly idle as opposed to when idle is
+		 * scheduled as a high priority task in its own right.
+		 */
+		next = idle;
+		schedstat_inc(rq, sched_goidle);
+		set_cpuidle_map(cpu);
+	} else {
+		next = earliest_deadline_task(rq, cpu, idle);
+		if (likely(next->prio != PRIO_LIMIT))
+			clear_cpuidle_map(cpu);
+		else
+			set_cpuidle_map(cpu);
+	}
+
+	if (likely(prev != next)) {
+		/*
+		 * Don't reschedule idle or deactivated tasks
+		 */
+		if (prev != idle && !deactivate)
+			resched_suitable_idle(prev);
+		/*
+		 * Don't stick tasks when a real time task is going to run as
+		 * they may literally get stuck.
+		 */
+		if (rt_task(next))
+			unstick_task(rq, prev);
+		set_rq_task(rq, next);
+		if (next != idle)
+			check_smt_siblings(cpu);
+		else
+			wake_smt_siblings(cpu);
+		grq.nr_switches++;
+		prev->on_cpu = false;
+		next->on_cpu = true;
+		rq->curr = next;
+		++*switch_count;
+
+		trace_sched_switch(preempt, prev, next);
+		rq = context_switch(rq, prev, next); /* unlocks the grq */
+		cpu = cpu_of(rq);
+		idle = rq->idle;
+	} else {
+		check_smt_siblings(cpu);
+		grq_unlock_irq();
+	}
+
+rerun_prev_unlocked:
+	return;
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+	if (!tsk->state || tsk_is_pi_blocked(tsk) ||
+	    preempt_count() ||
+	    signal_pending_state(tsk->state, tsk))
+		return;
+
+	/*
+	 * If we are going to sleep and we have plugged IO queued,
+	 * make sure to submit it to avoid deadlocks.
+	 */
+	if (blk_needs_flush_plug(tsk))
+		blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
+	do {
+		preempt_disable();
+		__schedule(false);
+		sched_preempt_enable_no_resched();
+	} while (need_resched());
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+	/*
+	 * If we come here after a random call to set_need_resched(),
+	 * or we have been woken up remotely but the IPI has not yet arrived,
+	 * we haven't yet exited the RCU idle mode. Do it here manually until
+	 * we find a better solution.
+	 *
+	 * NB: There are buggy callers of this function.  Ideally we
+	 * should warn if prev_state != IN_USER, but that will trigger
+	 * too frequently to make sense yet.
+	 */
+	enum ctx_state prev_state = exception_enter();
+	schedule();
+	exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	sched_preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+	do {
+		preempt_disable_notrace();
+		__schedule(true);
+		preempt_enable_no_resched_notrace();
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+	} while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task. Just return..
+	 */
+	if (likely(!preemptible()))
+		return;
+
+	preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+/**
+ * preempt_schedule_notrace - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
+{
+	enum ctx_state prev_ctx;
+
+	if (likely(!preemptible()))
+		return;
+
+	do {
+		preempt_disable_notrace();
+		/*
+		 * Needs preempt disabled in case user_exit() is traced
+		 * and the tracer calls preempt_enable_notrace() causing
+		 * an infinite recursion.
+		 */
+		prev_ctx = exception_enter();
+		__schedule(true);
+		exception_exit(prev_ctx);
+
+		preempt_enable_no_resched_notrace();
+	} while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note that this is called and returns with irqs disabled. This will
+ * protect us against recursive calling from irq context.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+	enum ctx_state prev_state;
+
+	/* Catch callers which need to be fixed */
+	BUG_ON(preempt_count() || !irqs_disabled());
+
+	prev_state = exception_enter();
+
+	do {
+		preempt_disable();
+		local_irq_enable();
+		__schedule(true);
+		local_irq_disable();
+		sched_preempt_enable_no_resched();
+	} while (need_resched());
+
+	exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+			  void *key)
+{
+	return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	unsigned long flags;
+	int queued, oldprio;
+	struct rq *rq;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_grq_lock(p, &flags);
+
+	/*
+	 * Idle task boosting is a no-no in general. There is one
+	 * exception, when PREEMPT_RT and NOHZ is active:
+	 *
+	 * The idle task calls get_next_timer_interrupt() and holds
+	 * the timer wheel base->lock on the CPU and another CPU wants
+	 * to access the timer (probably to cancel it). We can safely
+	 * ignore the boosting request, as the idle CPU runs this code
+	 * with interrupts disabled and will complete the lock
+	 * protected section without being interrupted. So there is no
+	 * real need to boost.
+	 */
+	if (unlikely(p == rq->idle)) {
+		WARN_ON(p != rq->curr);
+		WARN_ON(p->pi_blocked_on);
+		goto out_unlock;
+	}
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	queued = task_queued(p);
+	if (queued)
+		dequeue_task(p);
+	p->prio = prio;
+	if (task_running(p) && prio > oldprio)
+		resched_task(p);
+	if (queued) {
+		enqueue_task(p, rq);
+		try_preempt(p, rq);
+	}
+
+out_unlock:
+	task_grq_unlock(&flags);
+}
+
+#endif
+
+/*
+ * Adjust the deadline for when the priority is to change, before it's
+ * changed.
+ */
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
+{
+	p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int queued, new_static, old_static;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+		return;
+	new_static = NICE_TO_PRIO(nice);
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = time_task_grq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling while the task has an
+	 * RT policy; it only takes effect once the task is back to
+	 * SCHED_NORMAL/SCHED_BATCH:
+	if (has_rt_policy(p)) {
+		p->static_prio = new_static;
+		goto out_unlock;
+	}
+	queued = task_queued(p);
+	if (queued)
+		dequeue_task(p);
+
+	adjust_deadline(p, new_static);
+	old_static = p->static_prio;
+	p->static_prio = new_static;
+	p->prio = effective_prio(p);
+
+	if (queued) {
+		enqueue_task(p, rq);
+		if (new_static < old_static)
+			try_preempt(p, rq);
+	} else if (task_running(p)) {
+		reset_rq_task(rq, p);
+		if (old_static < new_static)
+			resched_task(p);
+	}
+out_unlock:
+	task_grq_unlock(&flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = nice_to_rlimit(nice);
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
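+
+/*
+ * Illustrative sketch (not part of this patch): RLIMIT_NICE uses the
+ * rlimit-style range [1,40], so a task whose limit is 30 may drop to
+ * nice -10 but needs CAP_SYS_NICE to go any lower:
+ *
+ *	nice_to_rlimit(-10) == 30	-> can_nice(p, -10) is true
+ *	nice_to_rlimit(-11) == 31	-> can_nice(p, -11) is false
+ */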
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+
+	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+	nice = task_nice(current) + increment;
+
+	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
+ */
+int task_prio(const struct task_struct *p)
+{
+	int delta, prio = p->prio - MAX_RT_PRIO;
+
+	/* rt tasks and iso tasks */
+	if (prio <= 0)
+		goto out;
+
+	/* Convert to ms to avoid overflows */
+	delta = NS_TO_MS(p->deadline - grq.niffies);
+	delta = delta * 40 / ms_longest_deadline_diff();
+	if (delta > 0 && delta <= 80)
+		prio += delta;
+	if (idleprio_task(p))
+		prio += 40;
+out:
+	return prio;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold grq lock. */
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
+			   int prio, bool keep_boost)
+{
+	int oldrtprio, oldprio;
+
+	p->policy = policy;
+	oldrtprio = p->rt_priority;
+	p->rt_priority = prio;
+	p->normal_prio = normal_prio(p);
+	oldprio = p->prio;
+	/*
+	 * Keep a potential priority boosting if called from
+	 * sched_setscheduler().
+	 */
+	if (keep_boost) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
+	} else
+		p->prio = p->normal_prio;
+	if (task_running(p)) {
+		reset_rq_task(rq, p);
+		/* Resched only if we might now be preempted */
+		if (p->prio > oldprio || p->rt_priority > oldrtprio)
+			resched_task(p);
+	}
+}
+
+/*
+ * Check whether the target process has a UID that matches the current process's.
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (uid_eq(cred->euid, pcred->euid) ||
+		 uid_eq(cred->euid, pcred->uid));
+	rcu_read_unlock();
+	return match;
+}
+
+static int
+__sched_setscheduler(struct task_struct *p, int policy,
+		     const struct sched_param *param, bool user, bool pi)
+{
+	struct sched_param zero_param = { .sched_priority = 0 };
+	int queued, retval, oldpolicy = -1;
+	unsigned long flags, rlim_rtprio = 0;
+	int reset_on_fork;
+	struct rq *rq;
+
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
+
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (!SCHED_RANGE(policy))
+			return -EINVAL;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
+	 */
+	if (param->sched_priority < 0 ||
+	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+	    (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+		return -EINVAL;
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (user && !capable(CAP_SYS_NICE)) {
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+			    param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		} else {
+			switch (p->policy) {
+				/*
+				 * Can only downgrade policies but not back to
+				 * SCHED_NORMAL
+				 */
+				case SCHED_ISO:
+					if (policy == SCHED_ISO)
+						goto out;
+					if (policy == SCHED_NORMAL)
+						return -EPERM;
+					break;
+				case SCHED_BATCH:
+					if (policy == SCHED_BATCH)
+						goto out;
+					if (policy != SCHED_IDLEPRIO)
+						return -EPERM;
+					break;
+				case SCHED_IDLEPRIO:
+					if (policy == SCHED_IDLEPRIO)
+						goto out;
+					return -EPERM;
+				default:
+					break;
+			}
+		}
+
+		/* can't change other user's priorities */
+		if (!check_same_owner(p))
+			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
+	}
+
+	if (user) {
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/*
+	 * To be able to change p->policy safely, the global runqueue (grq)
+	 * lock must be held.
+	 */
+	rq = __task_grq_lock(p);
+
+	/*
+	 * Changing the policy of the stop thread is a very bad idea.
+	 */
+	if (p == rq->stop) {
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return 0;
+	}
+
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		goto recheck;
+	}
+	update_clocks(rq);
+	p->sched_reset_on_fork = reset_on_fork;
+
+	queued = task_queued(p);
+	if (queued)
+		dequeue_task(p);
+	__setscheduler(p, rq, policy, param->sched_priority, pi);
+	if (queued) {
+		enqueue_task(p, rq);
+		try_preempt(p, rq);
+	}
+	__task_grq_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	if (pi)
+		rt_mutex_adjust_pi(p);
+out:
+	return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may already be dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
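+
+/*
+ * Illustrative in-kernel usage (not part of this patch): a kernel thread
+ * that must run at a fixed realtime priority, assuming the caller holds a
+ * reference to @task:
+ *
+ *	struct sched_param sp = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ *
+ *	if (sched_setscheduler(task, SCHED_FIFO, &sp))
+ *		pr_warn("could not switch task to SCHED_FIFO\n");
+ */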
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	const struct sched_param param = { .sched_priority = attr->sched_priority };
+	int policy = attr->sched_policy;
+
+	return __sched_setscheduler(p, policy, &param, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * Zero the full structure, so that a short copy leaves the
+	 * trailing fields well defined (all zeroes).
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return 0;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension, must be zero.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
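+
+/*
+ * Illustrative userspace sketch (not part of this patch): glibc provides
+ * no wrapper for this syscall, so it is usually invoked directly:
+ *
+ *	struct sched_attr attr = {
+ *		.size		= sizeof(attr),
+ *		.sched_policy	= SCHED_BATCH,
+ *		.sched_nice	= 10,
+ *	};
+ *
+ *	if (syscall(__NR_sched_setattr, getpid(), &attr, 0))
+ *		perror("sched_setattr");
+ */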
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp = { .sched_priority = 0 };
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	if (has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get incomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				return -EFBIG;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, attr->size);
+	if (ret)
+		return -EFAULT;
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return ret;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size, unsigned int, flags)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0 || flags)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (rt_task(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = task_nice(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		put_online_cpus();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (p->flags & PF_NO_SETAFFINITY) {
+		retval = -EINVAL;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
+			goto out_unlock;
+		}
+		rcu_read_unlock();
+	}
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	put_online_cpus();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
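+
+/*
+ * Illustrative userspace sketch (not part of this patch): pinning the
+ * calling thread to CPU 2 through the libc wrapper:
+ *
+ *	cpu_set_t set;
+ *
+ *	CPU_ZERO(&set);
+ *	CPU_SET(2, &set);
+ *	if (sched_setaffinity(0, sizeof(set), &set))
+ *		perror("sched_setaffinity");
+ */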
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
+	grq_unlock_irqrestore(&flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
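+
+/*
+ * Note that on success the raw syscall returns the number of bytes copied
+ * into the user mask, not 0; the glibc wrapper converts this to 0.
+ * Illustrative userspace sketch (not part of this patch):
+ *
+ *	cpu_set_t set;
+ *
+ *	if (sched_getaffinity(0, sizeof(set), &set) == 0)
+ *		printf("%d CPUs allowed\n", CPU_COUNT(&set));
+ */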
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. If it still has the earliest deadline
+ * it will be scheduled again as the next task.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct task_struct *p;
+
+	p = current;
+	grq_lock_irq();
+	schedstat_inc(task_rq(p), yld_count);
+	requeue_task(p);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(grq.lock);
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&grq.lock);
+	sched_preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched(0)) {
+		preempt_schedule_common();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
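+
+/*
+ * Illustrative sketch (not part of this patch): long-running kernel loops
+ * insert cond_resched() as voluntary preemption points, here with a
+ * hypothetical list and process_one() helper:
+ *
+ *	list_for_each_entry(obj, &big_list, node) {
+ *		process_one(obj);
+ *		cond_resched();
+ *	}
+ */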
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
+	int ret = 0;
+
+	lockdep_assert_held(lock);
+
+	if (spin_needbreak(lock) || resched) {
+		spin_unlock(lock);
+		if (resched)
+			preempt_schedule_common();
+		else
+			cpu_relax();
+		ret = 1;
+		spin_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
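+
+/*
+ * Illustrative sketch (not part of this patch): cond_resched_lock() lets a
+ * loop that holds a spinlock periodically give up both the CPU and the
+ * lock, using hypothetical more_work()/do_one_unit() helpers:
+ *
+ *	spin_lock(&lock);
+ *	while (more_work()) {
+ *		do_one_unit();
+ *		cond_resched_lock(&lock);
+ *	}
+ *	spin_unlock(&lock);
+ */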
+
+int __sched __cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
+		local_bh_enable();
+		preempt_schedule_common();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * 	yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ *	true (>0) if we indeed boosted the target task.
+ *	false (0) if we failed to boost the target.
+ *	-ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	int yielded = 0;
+
+	rq = this_rq();
+	grq_lock_irqsave(&flags);
+	if (task_running(p) || p->state) {
+		yielded = -ESRCH;
+		goto out_unlock;
+	}
+
+	p_rq = task_rq(p);
+	yielded = 1;
+	if (p->deadline > rq->rq_deadline)
+		p->deadline = rq->rq_deadline;
+	p->time_slice += rq->rq_time_slice;
+	rq->rq_time_slice = 0;
+	if (p->time_slice > timeslice())
+		p->time_slice = timeslice();
+	if (preempt && rq != p_rq)
+		resched_curr(p_rq);
+out_unlock:
+	grq_unlock_irqrestore(&flags);
+
+	if (yielded > 0)
+		schedule();
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+
+long __sched io_schedule_timeout(long timeout)
+{
+	int old_iowait = current->in_iowait;
+	struct rq *rq;
+	long ret;
+
+	current->in_iowait = 1;
+	blk_schedule_flush_plug(current);
+
+	delayacct_blkio_start();
+	rq = raw_rq();
+	atomic_inc(&rq->nr_iowait);
+	ret = schedule_timeout(timeout);
+	current->in_iowait = old_iowait;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+
+	return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
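+
+/*
+ * Illustrative sketch (not part of this patch): callers blocking on block
+ * IO use this instead of schedule_timeout() so the sleep is accounted as
+ * iowait; io_schedule() itself is just the MAX_SCHEDULE_TIMEOUT case:
+ *
+ *	set_current_state(TASK_UNINTERRUPTIBLE);
+ *	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+ */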
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct task_struct *p;
+	unsigned int time_slice;
+	unsigned long flags;
+	int retval;
+	struct timespec t;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
+	grq_unlock_irqrestore(&flags);
+
+	rcu_read_unlock();
+	t = ns_to_timespec(time_slice);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+	unsigned long free = 0;
+	int ppid;
+	unsigned long state = p->state;
+
+	if (state)
+		state = __ffs(state) + 1;
+	printk(KERN_INFO "%-15.15s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT " running  ");
+	else
+		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT "  running task    ");
+	else
+		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	free = stack_not_used(p);
+#endif
+	ppid = 0;
+	rcu_read_lock();
+	if (pid_alive(p))
+		ppid = task_pid_nr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), ppid,
+		(unsigned long)task_thread_info(p)->flags);
+
+	print_worker_info(KERN_INFO, p);
+	show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+	struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+	printk(KERN_INFO
+		"  task                PC stack   pid father\n");
+#else
+	printk(KERN_INFO
+		"  task                        PC stack   pid father\n");
+#endif
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		/*
+		 * Reset the NMI-timeout; listing all tasks on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		if (!state_filter || (p->state & state_filter))
+			sched_show_task(p);
+	}
+
+	touch_all_softlockup_watchdogs();
+
+	rcu_read_unlock();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (!state_filter)
+		debug_show_all_locks();
+}
+
+void dump_cpu_task(int cpu)
+{
+	pr_info("Task dump for CPU %d:\n", cpu);
+	sched_show_task(cpu_curr(cpu));
+}
+
+#ifdef CONFIG_SMP
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(tsk_cpus_allowed(p), new_mask);
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	time_lock_grq(rq);
+	idle->last_ran = rq->clock_task;
+	idle->state = TASK_RUNNING;
+	/* Setting prio to illegal value shouldn't matter when never queued */
+	idle->prio = PRIO_LIMIT;
+
+	kasan_unpoison_task_stack(idle);
+
+#ifdef CONFIG_SMP
+	/*
+	 * It's possible that init_idle() gets called multiple times on a task,
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialisation.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMT_NICE
+	idle->smt_bias = 0;
+#endif
+#endif
+	set_rq_task(rq, idle);
+
+	/* Silence PROVE_RCU */
+	rcu_read_lock();
+	set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+
+	rq->curr = rq->idle = idle;
+	idle->on_cpu = 1;
+	grq_unlock();
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	init_idle_preempt_count(idle, cpu);
+
+	ftrace_graph_init_idle_task(idle, cpu);
+#ifdef CONFIG_SMP
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+			      const struct cpumask __maybe_unused *trial)
+{
+	return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * it's already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_list().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
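+
+/*
+ * Illustrative sketch (not part of this patch): the usual wake_q pattern
+ * collects wakeups under a lock and issues them after dropping it, here
+ * with a hypothetical pick_next_waiter() helper:
+ *
+ *	struct task_struct *p;
+ *	WAKE_Q(wake_q);
+ *
+ *	spin_lock(&lock);
+ *	while ((p = pick_next_waiter(&lock)))
+ *		wake_q_add(&wake_q, p);
+ *	spin_unlock(&lock);
+ *	wake_up_q(&wake_q);
+ */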
+
+void resched_cpu(int cpu)
+{
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+	resched_task(cpu_curr(cpu));
+	grq_unlock_irqrestore(&flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be up to date wrt jiffies etc.).
+ */
+int get_nohz_timer_target(void)
+{
+	int i, cpu = smp_processor_id();
+	struct sched_domain *sd;
+
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+		return cpu;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	if (cpu == smp_processor_id())
+		return;
+
+	set_tsk_need_resched(cpu_rq(cpu)->idle);
+	smp_send_reschedule(cpu);
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+	wake_up_idle_cpu(cpu);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
+{
+	bool running_wrong = false;
+	bool queued = false;
+	unsigned long flags;
+	struct rq *rq;
+	int ret = 0;
+
+	rq = task_grq_lock(p, &flags);
+
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	queued = task_queued(p);
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	if (task_running(p)) {
+		/* Task is running on the wrong cpu now, reschedule it. */
+		if (rq == this_rq()) {
+			set_tsk_need_resched(p);
+			running_wrong = true;
+		} else
+			resched_task(p);
+	} else
+		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
+
+out:
+	if (queued)
+		try_preempt(p, rq);
+	task_grq_unlock(&flags);
+
+	if (running_wrong)
+		preempt_schedule_common();
+
+	return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Run through the task list and find tasks affined to the dead cpu, then
+ * remove that cpu from their allowed mask, enable cpu0 and set the
+ * zerobound flag.
+ */
+static void bind_zero(int src_cpu)
+{
+	struct task_struct *p, *t;
+	int bound = 0;
+
+	if (src_cpu == 0)
+		return;
+
+	do_each_thread(t, p) {
+		if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
+			cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
+			cpumask_set_cpu(0, tsk_cpus_allowed(p));
+			p->zerobound = true;
+			bound++;
+		}
+		clear_sticky(p);
+	} while_each_thread(t, p);
+
+	if (bound) {
+		printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
+		       bound, src_cpu);
+	}
+}
+
+/*
+ * Find processes with the zerobound flag and re-enable their affinity for
+ * the CPU coming alive.
+ */
+static void unbind_zero(int src_cpu)
+{
+	int unbound = 0, zerobound = 0;
+	struct task_struct *p, *t;
+
+	if (src_cpu == 0)
+		return;
+
+	do_each_thread(t, p) {
+		if (!p->mm)
+			p->zerobound = false;
+		if (p->zerobound) {
+			unbound++;
+			cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
+			/*
+			 * Once every CPU affinity has been re-enabled,
+			 * remove the zerobound flag.
+			 */
+			if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
+				p->zerobound = false;
+				zerobound++;
+			}
+		}
+	} while_each_thread(t, p);
+
+	if (unbound) {
+		printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
+		       unbound, src_cpu);
+	}
+	if (zerobound) {
+		printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
+		       zerobound);
+	}
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm) {
+		switch_mm(mm, &init_mm, current);
+		finish_arch_post_lock_switch();
+	}
+	mmdrop(mm);
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void unbind_zero(int src_cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+	struct sched_param start_param = { .sched_priority = 0 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task; it's something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling policy so that
+		 * it can die in pieces.
+		 */
+		sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
+	}
+}
+
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		mode_t mode, proc_handler *proc_handler)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[12], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[13] is terminator */
+
+	return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct ctl_table *entry, *table;
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	char buf[32];
+
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+	if (table == NULL)
+		return NULL;
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_possible_cpus();
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	char buf[32];
+
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
+	if (entry == NULL)
+		return;
+
+	for_each_possible_cpu(i) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+		entry++;
+	}
+
+	WARN_ON(sd_sysctl_header);
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
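+
+/*
+ * The resulting tree lives under /proc/sys/kernel/sched_domain/, one
+ * directory per possible CPU and one per attached domain level, e.g.
+ * (illustrative paths):
+ *
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/flags
+ *	/proc/sys/kernel/sched_domain/cpu0/domain1/name
+ */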
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+	unregister_sysctl_table(sd_sysctl_header);
+	sd_sysctl_header = NULL;
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
+
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = true;
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = false;
+	}
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ */
+static int
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	struct task_struct *idle = rq->idle;
+#endif
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		return NOTIFY_OK;
+	case CPU_UP_PREPARE:
+		break;
+
+	case CPU_ONLINE:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+			set_rq_online(rq);
+		}
+		unbind_zero(cpu);
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		grq_lock_irq();
+		set_rq_task(rq, idle);
+		update_clocks(rq);
+		grq_unlock_irq();
+		break;
+
+	case CPU_DYING:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+			set_rq_offline(rq);
+		}
+		bind_zero(cpu);
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
+ */
+static struct notifier_block  migration_notifier = {
+	.notifier_call = migration_call,
+	.priority = CPU_PRI_MIGRATION,
+};
+
+static int sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		return NOTIFY_OK;
+	case CPU_ONLINE:
+		/*
+		 * At this point a starting CPU has marked itself as online via
+		 * set_cpu_online(). But it might not yet have marked itself
+		 * as active, which is essential from here on.
+		 */
+		set_cpu_active(cpu, true);
+		stop_machine_unpark(cpu);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_FAILED:
+		set_cpu_active(cpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	/* Initialise migration for the boot CPU */
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+	return 0;
+}
+early_initcall(migration_init);
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+	sched_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+				  struct cpumask *groupmask)
+{
+	cpumask_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		printk("does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent");
+		return -1;
+	}
+
+	printk(KERN_CONT "span %*pbl level %s\n",
+	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+
+	printk(KERN_CONT "\n");
+
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	if (!sched_debug_enabled)
+		return;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	for (;;) {
+		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+			break;
+		level++;
+		sd = sd->parent;
+		if (!sd)
+			break;
+	}
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
+		return 1;
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_AFFINE))
+		return 0;
+
+	return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+		return 0;
+
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+	cpupri_cleanup(&rd->cpupri);
+	free_cpumask_var(rd->rto_mask);
+	free_cpumask_var(rd->online);
+	free_cpumask_var(rd->span);
+	kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	struct root_domain *old_rd = NULL;
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+
+	if (rq->rd) {
+		old_rd = rq->rd;
+
+		if (cpumask_test_cpu(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
+
+		cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+		/*
+		 * If we don't want to free the old_rd yet then
+		 * set old_rd to NULL to skip the freeing later
+		 * in this function:
+		 */
+		if (!atomic_dec_and_test(&old_rd->refcount))
+			old_rd = NULL;
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	cpumask_set_cpu(rq->cpu, rd->span);
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+		set_rq_online(rq);
+
+	grq_unlock_irqrestore(&flags);
+
+	if (old_rd)
+		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+		goto out;
+	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+		goto free_span;
+	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_online;
+
+	if (cpupri_init(&rd->cpupri) != 0)
+		goto free_rto_mask;
+	return 0;
+
+free_rto_mask:
+	free_cpumask_var(rd->rto_mask);
+free_online:
+	free_cpumask_var(rd->online);
+free_span:
+	free_cpumask_var(rd->span);
+out:
+	return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+	init_rootdomain(&def_root_domain);
+
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	if (init_rootdomain(rd) != 0) {
+		kfree(rd);
+		return NULL;
+	}
+
+	return rd;
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+	kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+	call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+	for (; sd; sd = sd->parent)
+		destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; ) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+
+		if (sd_parent_degenerate(tmp, parent)) {
+			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+			/*
+			 * Transfer SD_PREFER_SIBLING down in case of a
+			 * degenerate parent; the spans match in this
+			 * case, so the property transfers.
+			 */
+			if (parent->flags & SD_PREFER_SIBLING)
+				tmp->flags |= SD_PREFER_SIBLING;
+			destroy_sched_domain(parent, cpu);
+		} else
+			tmp = tmp->parent;
+	}
+
+	if (sd && sd_degenerate(sd)) {
+		tmp = sd;
+		sd = sd->parent;
+		destroy_sched_domain(tmp, cpu);
+		if (sd)
+			sd->child = NULL;
+	}
+
+	sched_domain_debug(sd, cpu);
+
+	rq_attach_root(rq, rd);
+	tmp = rq->sd;
+	rcu_assign_pointer(rq->sd, sd);
+	destroy_sched_domains(tmp, cpu);
+}
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	cpulist_parse(str, cpu_isolated_map);
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+	struct sched_domain ** __percpu sd;
+	struct root_domain	*rd;
+};
+
+enum s_alloc {
+	sa_rootdomain,
+	sa_sd,
+	sa_sd_storage,
+	sa_none,
+};
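+
+/*
+ * The enum above doubles as a rollback state for __free_domain_allocs():
+ * its switch statement falls through from the last successfully allocated
+ * stage downwards.
+ */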
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+	if (kstrtoint(str, 0, &default_relax_domain_level))
+		pr_warn("Unable to set relax_domain_level\n");
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+				 struct sched_domain_attr *attr)
+{
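+	/*
+	 * relax_domain_level selects up to which domain level wake/newidle
+	 * balancing stays enabled; levels above the requested value get it
+	 * switched off below.
+	 */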
+	int request;
+
+	if (!attr || attr->relax_domain_level < 0) {
+		if (default_relax_domain_level < 0)
+			return;
+		else
+			request = default_relax_domain_level;
+	} else
+		request = attr->relax_domain_level;
+	if (request < sd->level) {
+		/* turn off idle balance on this domain */
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	} else {
+		/* turn on idle balance on this domain */
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	}
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_rootdomain:
+		if (!atomic_read(&d->rd->refcount))
+			free_rootdomain(&d->rd->rcu); /* fall through */
+	case sa_sd:
+		free_percpu(d->sd); /* fall through */
+	case sa_sd_storage:
+		__sdt_free(cpu_map); /* fall through */
+	case sa_none:
+		break;
+	}
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
+	memset(d, 0, sizeof(*d));
+
+	if (__sdt_alloc(cpu_map))
+		return sa_sd_storage;
+	d->sd = alloc_percpu(struct sched_domain *);
+	if (!d->sd)
+		return sa_sd_storage;
+	d->rd = alloc_rootdomain();
+	if (!d->rd)
+		return sa_sd;
+	return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain
+ * structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+
+	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
 * SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUCAPACITY |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING |		\
+	 SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
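+		/*
+		 * The 1*FLAG / 0*FLAG notation below keeps every candidate
+		 * flag visible even when it is disabled by default.
+		 */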
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 1*SD_WAKE_AFFINE
+					| 0*SD_SHARE_CPUCAPACITY
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 0*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| 0*SD_NUMA
+					| sd_flags
+					,
+
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
+		.max_newidle_lb_cost	= 0,
+		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
+	};
+
+	/*
+	 * Convert topological properties into behaviour.
+	 */
+
+	if (sd->flags & SD_SHARE_CPUCAPACITY) {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
+
+	return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+	default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+	static bool done = false;
+	int i, j;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		printk(KERN_WARNING "  ");
+		for (j = 0; j < nr_node_ids; j++)
+			printk(KERN_CONT "%02d ", node_distance(i, j));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+	int i;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		if (sched_domains_numa_distance[i] == distance)
+			return true;
+	}
+
+	return false;
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			for (k = 0; k < nr_node_ids; k++) {
+				int distance = node_distance(i, k);
+
+				if (distance > curr_distance &&
+				    (distance < next_distance ||
+				     next_distance == curr_distance))
+					next_distance = distance;
+
+				/*
+				 * While not a strong assumption, it would be nice to know
+				 * about cases where node A is connected to B but B is not
+				 * equally connected to A.
+				 */
+				if (sched_debug() && node_distance(k, i) != distance)
+					sched_numa_warn("Node-distance not symmetric");
+
+				if (sched_debug() && i && !find_numa_distance(distance))
+					sched_numa_warn("Node-0 not representative");
+			}
+			if (next_distance != curr_distance) {
+				sched_domains_numa_distance[level++] = next_distance;
+				sched_domains_numa_levels = level;
+				curr_distance = next_distance;
+			} else {
+				break;
+			}
+		}
+
+		/*
+		 * In case of sched_debug() we verify the above assumption.
+		 */
+		if (!sched_debug())
+			break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
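+
+	/*
+	 * For example, on a two-node machine where node_distance() reports
+	 * 10 (local) and 20 (remote), the loop above leaves level == 1 and
+	 * sched_domains_numa_distance[0] == 20.
+	 */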
+
+	/*
+	 * Temporarily reset sched_domains_numa_levels to 0. If the allocation
+	 * of the sched_domains_numa_masks[][] array fails below, the array
+	 * would contain fewer than 'level' members, which would be dangerous
+	 * when other functions use sched_domains_numa_levels to iterate over
+	 * it.
+	 *
+	 * We reset it to 'level' at the end of this function.
+	 */
+	sched_domains_numa_levels = 0;
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for_each_node(k) {
+				if (node_distance(j, k) > sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level + 1) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+			SD_INIT_NAME(NUMA)
+		};
+	}
+
+	sched_domain_topology = tl;
+
+	sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+	int i, j;
+	int node = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			if (node_distance(j, node) <= sched_domains_numa_distance[i])
+				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+		}
+	}
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+	int i, j;
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++)
+			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+	}
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+					   unsigned long action,
+					   void *hcpu)
+{
+	int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+		sched_domains_numa_masks_set(cpu);
+		break;
+
+	case CPU_DEAD:
+		sched_domains_numa_masks_clear(cpu);
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+					   unsigned long action,
+					   void *hcpu)
+{
+	return 0;
+}
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
+
+		sdd->sd = alloc_percpu(struct sched_domain *);
+		if (!sdd->sd)
+			return -ENOMEM;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd;
+
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sd)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sd, j) = sd;
+		}
+	}
+
+	return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
+
+		for_each_cpu(j, cpu_map) {
+			if (sdd->sd)
+				kfree(*per_cpu_ptr(sdd->sd, j));
+		}
+		free_percpu(sdd->sd);
+		sdd->sd = NULL;
+	}
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+		struct sched_domain *child, int cpu)
+{
+	struct sched_domain *sd = sd_init(tl, cpu);
+	if (!sd)
+		return child;
+
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	if (child) {
+		sd->level = child->level + 1;
+		sched_domain_level_max = max(sched_domain_level_max, sd->level);
+		child->parent = sd;
+		sd->child = child;
+
+		if (!cpumask_subset(sched_domain_span(child),
+				    sched_domain_span(sd))) {
+			pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+			pr_err("     the %s domain not a subset of the %s domain\n",
+					child->name, sd->name);
+#endif
+			/* Fixup, ensure @sd has at least @child cpus. */
+			cpumask_or(sched_domain_span(sd),
+				   sched_domain_span(sd),
+				   sched_domain_span(child));
+		}
+
+	}
+	set_domain_attribute(sd, attr);
+
+	return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+			       struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state;
+	struct sched_domain *sd;
+	struct s_data d;
+	int i, ret = -ENOMEM;
+
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+
+	/* Set up domains for cpus specified by the cpu_map. */
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain_topology_level *tl;
+
+		sd = NULL;
+		for_each_sd_topology(tl) {
+			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+			if (tl == sched_domain_topology)
+				*per_cpu_ptr(d.sd, i) = sd;
+			if (tl->flags & SDTL_OVERLAP)
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
+	}
+
+	/* Claim the sched domain allocations so they are not freed below */
+	for (i = nr_cpumask_bits-1; i >= 0; i--) {
+		if (!cpumask_test_cpu(i, cpu_map))
+			continue;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			claim_allocations(i, sd);
+		}
+	}
+
+	/* Attach the domains */
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d.sd, i);
+		cpu_attach_domain(sd, d.rd, i);
+	}
+	rcu_read_unlock();
+
+	ret = 0;
+error:
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return ret;
+}
+
+static cpumask_var_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attributes of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+	return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+	int err;
+
+	arch_update_cpu_topology();
+	ndoms_cur = 1;
+	doms_cur = alloc_sched_domains(ndoms_cur);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	err = build_sched_domains(doms_cur[0], NULL);
+	register_sched_domain_sysctl();
+
+	return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int i;
+
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map)
+		cpu_attach_domain(NULL, &def_root_domain, i);
+	rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr tmp;
+
+	/* fast path */
+	if (!new && !cur)
+		return 1;
+
+	tmp = SD_ATTR_INIT;
+	return !memcmp(cur ? (cur + idx_cur) : &tmp,
+			new ? (new + idx_new) : &tmp,
+			sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap). We should set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; this also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	int i, j, n;
+	int new_topology;
+
+	mutex_lock(&sched_domains_mutex);
+
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
+	n = doms_new ? ndoms_new : 0;
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_cur[i], doms_new[j])
+			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur[i]);
+match1:
+		;
+	}
+
+	n = ndoms_cur;
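+	/*
+	 * With no doms_new[] supplied, fall back to a single domain spanning
+	 * all active, non-isolated CPUs; setting n = 0 keeps the loop below
+	 * from matching it against doms_cur[], so it is always rebuilt.
+	 */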
+	if (doms_new == NULL) {
+		n = 0;
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		WARN_ON_ONCE(dattr_new);
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j])
+			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
+	kfree(dattr_cur);	/* kfree(NULL) is safe */
+	doms_cur = doms_new;
+	dattr_cur = dattr_new;
+	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
+
+	mutex_unlock(&sched_domains_mutex);
+}
+
+static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
+
+/*
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+			     void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED_FROZEN:
+
+		/*
+		 * num_cpus_frozen tracks how many CPUs are involved in the
+		 * suspend/resume sequence. As long as this is not the last
+		 * online operation in the resume sequence, just build a single
+		 * sched domain, ignoring cpusets.
+		 */
+		num_cpus_frozen--;
+		if (likely(num_cpus_frozen)) {
+			partition_sched_domains(1, NULL, NULL);
+			break;
+		}
+
+		/*
+		 * This is the last CPU online operation. So fall through and
+		 * restore the original sched domains by considering the
+		 * cpuset configurations.
+		 */
+
+	case CPU_ONLINE:
+		cpuset_update_active_cpus(true);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+	return NOTIFY_OK;
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+			       void *hcpu)
+{
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus(false);
+		break;
+	case CPU_DOWN_PREPARE_FROZEN:
+		num_cpus_frozen++;
+		partition_sched_domains(1, NULL, NULL);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+	return NOTIFY_OK;
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(int cpu)
+{
+	return rq_idle(cpu_rq(cpu));
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static const cpumask_t *thread_cpumask(int cpu)
+{
+	return topology_sibling_cpumask(cpu);
+}
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(int cpu)
+{
+	return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+static const cpumask_t *core_cpumask(int cpu)
+{
+	return topology_core_cpumask(cpu);
+}
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(int cpu)
+{
+	return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_BOOK,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+	struct sched_domain *sd;
+	int cpu, other_cpu;
+
+	cpumask_var_t non_isolated_cpus;
+
+	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+	sched_init_numa();
+
+	/*
+	 * There's no userspace yet to cause hotplug operations; hence all the
+	 * cpu masks are stable and all blatant races in the below code cannot
+	 * happen.
+	 */
+	mutex_lock(&sched_domains_mutex);
+	init_sched_domains(cpu_active_mask);
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+	if (cpumask_empty(non_isolated_cpus))
+		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+	mutex_unlock(&sched_domains_mutex);
+
+	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+		BUG();
+	free_cpumask_var(non_isolated_cpus);
+
+	mutex_lock(&sched_domains_mutex);
+	grq_lock_irq();
+	/*
+	 * Set up the relative cache distance of each online cpu from each
+	 * other in a simple array for quick lookup. Locality is determined
+	 * by the closest sched_domain that CPUs are separated by. CPUs with
+	 * shared cache in SMT and MC are treated as local. CPUs in the same
+	 * NUMA node but without a shared cache (whether in the same physical
+	 * package or not) are treated as not local. CPUs not even in the same
+	 * domain (different nodes) are treated as very distant.
+	 */
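+	/*
+	 * Resulting cpu_locality values: 0 = same CPU, 1 = SMT sibling,
+	 * 2 = shared cache (MC), 3 = same NUMA node, 4 = different node.
+	 */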
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		/* First check if this cpu is in the same node */
+		for_each_domain(cpu, sd) {
+			if (sd->level > SD_LV_NODE)
+				continue;
+			/* Set locality to local node if not already found lower */
+			for_each_cpu(other_cpu, sched_domain_span(sd)) {
+				if (rq->cpu_locality[other_cpu] > 3)
+					rq->cpu_locality[other_cpu] = 3;
+			}
+		}
+
+		/*
+		 * Each runqueue has its own function in case it doesn't have
+		 * siblings of its own, allowing mixed topologies.
+		 */
+#ifdef CONFIG_SCHED_MC
+		for_each_cpu(other_cpu, core_cpumask(cpu)) {
+			if (rq->cpu_locality[other_cpu] > 2)
+				rq->cpu_locality[other_cpu] = 2;
+		}
+		if (cpumask_weight(core_cpumask(cpu)) > 1)
+			rq->cache_idle = cache_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_SMT
+		for_each_cpu(other_cpu, thread_cpumask(cpu))
+			rq->cpu_locality[other_cpu] = 1;
+		if (cpumask_weight(thread_cpumask(cpu)) > 1)
+			rq->siblings_idle = siblings_cpu_idle;
+#endif
+	}
+	grq_unlock_irq();
+	mutex_unlock(&sched_domains_mutex);
+
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		for_each_online_cpu(other_cpu) {
+			if (other_cpu <= cpu)
+				continue;
+			printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+		}
+	}
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+#ifdef CONFIG_SMP
+	int cpu_ids;
+#endif
+	int i;
+	struct rq *rq;
+
+	prio_ratios[0] = 128;
+	for (i = 1 ; i < NICE_WIDTH ; i++)
+		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+	raw_spin_lock_init(&grq.lock);
+	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
+	grq.niffies = 0;
+	grq.last_jiffy = jiffies;
+	raw_spin_lock_init(&grq.iso_lock);
+	grq.iso_ticks = 0;
+	grq.iso_refractory = false;
+	grq.noc = 1;
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+	grq.qnr = grq.idle_cpus = 0;
+	cpumask_clear(&grq.cpu_idle_map);
+#else
+	uprq = &per_cpu(runqueues, 0);
+#endif
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		rq->grq_lock = &grq.lock;
+		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
+			      rq->iowait_pc = rq->idle_pc = 0;
+		rq->dither = false;
+#ifdef CONFIG_SMP
+		rq->sticky_task = NULL;
+		rq->last_niffy = 0;
+		rq->sd = NULL;
+		rq->rd = NULL;
+		rq->online = false;
+		rq->cpu = i;
+		rq_attach_root(rq, &def_root_domain);
+#endif
+		atomic_set(&rq->nr_iowait, 0);
+	}
+
+#ifdef CONFIG_SMP
+	cpu_ids = i;
+	/*
+	 * Set the base locality for cpu cache distance calculation to
+	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
+	 */
+	for_each_possible_cpu(i) {
+		int j;
+
+		rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+		rq->siblings_idle = sole_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+		rq->cache_idle = sole_cpu_idle;
+#endif
+		rq->cpu_locality = kmalloc(cpu_ids * sizeof(int), GFP_ATOMIC);
+		for_each_possible_cpu(j) {
+			if (i == j)
+				rq->cpu_locality[j] = 0;
+			else
+				rq->cpu_locality[j] = 4;
+		}
+	}
+#endif
+
+	for (i = 0; i < PRIO_LIMIT; i++)
+		INIT_LIST_HEAD(grq.queue + i);
+	/* delimiter for bitsearch */
+	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread; however, somewhere below it might be,
+	 * and because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+	idle_thread_set_boot_cpu();
+#endif /* SMP */
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
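+	/* Count both preemption disables and RCU read-side nesting */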
+	int nested = preempt_count() + rcu_preempt_depth();
+
+	return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state.
+	 * Since we will exit with TASK_RUNNING, make sure we enter with it;
+	 * otherwise we will destroy state.
+	 */
+	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+	     !is_idle_task(current)) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	if (task_stack_end_corrupted(current))
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (!preempt_count_equals(preempt_offset)) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
+	dump_stack();
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static inline void normalise_rt_tasks(void)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	struct rq *rq;
+	int queued;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(g, p) {
+		/*
+		 * Only normalize user tasks:
+		 */
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		if (!rt_task(p) && !iso_task(p))
+			continue;
+
+		rq = task_grq_lock(p, &flags);
+		queued = task_queued(p);
+		if (queued)
+			dequeue_task(p);
+		__setscheduler(p, rq, SCHED_NORMAL, 0, false);
+		if (queued) {
+			enqueue_task(p, rq);
+			try_preempt(p, rq);
+		}
+
+		task_grq_unlock(&flags);
+	}
+	read_unlock(&tasklist_lock);
+}
+
+void normalize_rt_tasks(void)
+{
+	normalise_rt_tasks();
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
 * must be called with all CPUs synchronised and interrupts disabled; the
 * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	*ut = p->utime;
+	*st = p->stime;
+}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
+
+void vtime_account_system_irqsafe(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	vtime_account_system(tsk);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_task_switch(struct task_struct *prev)
+{
+	if (is_idle_task(prev))
+		vtime_account_idle(prev);
+	else
+		vtime_account_system(prev);
+
+	vtime_account_user(prev);
+	arch_vtime_task_switch(prev);
+}
+#endif
+
+#else
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+	u64 scaled;
+
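+	/*
+	 * Repeatedly shrink rtime/total (or rebalance stime against rtime)
+	 * until everything fits in 32 bits, so the final 32x32->64 multiply
+	 * and 64/32 divide below cannot overflow.
+	 */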
+	for (;;) {
+		/* Make sure "rtime" is the bigger of stime/rtime */
+		if (stime > rtime) {
+			u64 tmp = rtime; rtime = stime; stime = tmp;
+		}
+
+		/* Make sure 'total' fits in 32 bits */
+		if (total >> 32)
+			goto drop_precision;
+
+		/* Does rtime (and thus stime) fit in 32 bits? */
+		if (!(rtime >> 32))
+			break;
+
+		/* Can we just balance rtime/stime rather than dropping bits? */
+		if (stime >> 31)
+			goto drop_precision;
+
+		/* We can grow stime and shrink rtime and try to make them both fit */
+		stime <<= 1;
+		rtime >>= 1;
+		continue;
+
+drop_precision:
+		/* We drop from rtime, it has more bits than stime */
+		rtime >>= 1;
+		total >>= 1;
+	}
+
+	/*
+	 * Make sure gcc understands that this is a 32x32->64 multiply,
+	 * followed by a 64/32->64 divide.
+	 */
+	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+	return (__force cputime_t) scaled;
+}
+
+/*
 * Adjust the imprecise tick-based cputime accounting against the
 * scheduler's runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+			   struct prev_cputime *prev,
+			   cputime_t *ut, cputime_t *st)
+{
+	cputime_t rtime, stime, utime, total;
+
+	stime = curr->stime;
+	total = stime + curr->utime;
+
+	/*
+	 * Tick based cputime accounting depends on whether a task's random
+	 * scheduling timeslices happen to be interrupted by the timer.
+	 * Depending on these circumstances, the number of these interrupts
+	 * may over- or under-estimate the real user and system cputime,
+	 * matching it only with variable precision.
+	 *
+	 * Fix this by scaling these tick based values against the total
+	 * runtime accounted by the scheduler.
+	 */
+	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+	/*
+	 * Update userspace visible utime/stime values only if actual execution
+	 * time is bigger than already exported. Note that can happen, that we
+	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 */
+	if (prev->stime + prev->utime >= rtime)
+		goto out;
+
+	if (total) {
+		stime = scale_stime((__force u64)stime,
+				    (__force u64)rtime, (__force u64)total);
+		utime = rtime - stime;
+	} else {
+		stime = rtime;
+		utime = 0;
+	}
+
+	/*
+	 * If the tick based count grows faster than the scheduler one,
+	 * the result of the scaling may go backward.
+	 * Let's enforce monotonicity.
+	 */
+	prev->stime = max(prev->stime, stime);
+	prev->utime = max(prev->utime, utime);
+
+out:
+	*ut = prev->utime;
+	*st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime = {
+		.sum_exec_runtime = tsk_seruntime(p),
+	};
+
+	task_cputime(p, &cputime.utime, &cputime.stime);
+	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif
+
+void init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT	(10)
+#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long smt_gain = sd->smt_gain;
+
+	smt_gain /= weight;
+
+	return smt_gain;
+}
+#endif
diff --git b/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
new file mode 100644
index 0000000..5d97f79
--- /dev/null
+++ b/kernel/sched/bfs_sched.h
@@ -0,0 +1,181 @@
+#include <linux/sched.h>
+#include <linux/cpuidle.h>
+#include <linux/stop_machine.h>
+
+#ifndef BFS_SCHED_H
+#define BFS_SCHED_H
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+	struct task_struct *curr, *idle, *stop;
+	struct mm_struct *prev_mm;
+
+	/* Pointer to grq spinlock */
+	raw_spinlock_t *grq_lock;
+
+	/* Stored data about rq->curr to work outside grq lock */
+	u64 rq_deadline;
+	unsigned int rq_policy;
+	int rq_time_slice;
+	u64 rq_last_ran;
+	int rq_prio;
+	bool rq_running; /* There is a task running */
+	int soft_affined; /* Running or queued tasks with this set as their rq */
+#ifdef CONFIG_SMT_NICE
+	struct mm_struct *rq_mm;
+	int rq_smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+	/* Accurate timekeeping data */
+	u64 timekeep_clock;
+	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
+		iowait_pc, idle_pc;
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	int cpu;		/* cpu of this runqueue */
+	bool online;
+	bool scaling; /* This CPU is managed by a scaling CPU freq governor */
+	struct task_struct *sticky_task;
+
+	struct root_domain *rd;
+	struct sched_domain *sd;
+	int *cpu_locality; /* CPU relative cache distance */
+#ifdef CONFIG_SCHED_SMT
+	bool (*siblings_idle)(int cpu);
+	/* See if all smt siblings are idle */
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+	bool (*cache_idle)(int cpu);
+	/* See if all cache siblings are idle */
+#endif /* CONFIG_SCHED_MC */
+	u64 last_niffy; /* Last time this RQ updated grq.niffies */
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
+
+	u64 clock, old_clock, last_tick;
+	u64 clock_task;
+	bool dither;
+
+#ifdef CONFIG_SCHEDSTATS
+
+	/* latency stats */
+	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+	/* sys_sched_yield() stats */
+	unsigned int yld_count;
+
+	/* schedule() stats */
+	unsigned int sched_switch;
+	unsigned int sched_count;
+	unsigned int sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_count;
+	unsigned int ttwu_local;
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_CPU_IDLE
+	/* Must be inspected within an RCU read-side critical section */
+	struct cpuidle_state *idle_state;
+#endif
+};
+
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu);
+#endif
+
+#ifndef CONFIG_SMP
+extern struct rq *uprq;
+#define cpu_rq(cpu)	(uprq)
+#define this_rq()	(uprq)
+#define raw_rq()	(uprq)
+#define task_rq(p)	(uprq)
+#define cpu_curr(cpu)	((uprq)->curr)
+#else /* CONFIG_SMP */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq()		this_cpu_ptr(&runqueues)
+#define raw_rq()		raw_cpu_ptr(&runqueues)
+#endif /* CONFIG_SMP */
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+	return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+	lockdep_assert_held(rq->grq_lock);
+	return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+	lockdep_assert_held(rq->grq_lock);
+	return rq->clock_task;
+}
+
+extern struct mutex sched_domains_mutex;
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+			      lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+static inline void sched_ttwu_pending(void) { }
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+	return p->on_rq;
+}
+
+#ifdef CONFIG_SMP
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+				  struct cpuidle_state *idle_state)
+{
+	rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+	WARN_ON(!rcu_read_lock_held());
+	return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+				  struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+	return NULL;
+}
+#endif
+#endif /* BFS_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c1d2a0..5699f2d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -67,12 +67,10 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
-#include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 
@@ -125,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
 
 #undef SCHED_FEAT
 
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled)	\
-	#name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
-	int i;
-
-	for (i = 0; i < __SCHED_FEAT_NR; i++) {
-		if (!(sysctl_sched_features & (1UL << i)))
-			seq_puts(m, "NO_");
-		seq_printf(m, "%s ", sched_feat_names[i]);
-	}
-	seq_puts(m, "\n");
-
-	return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true  STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled)	\
-	jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
-	static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
-	static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
-	int i;
-	int neg = 0;
-
-	if (strncmp(cmp, "NO_", 3) == 0) {
-		neg = 1;
-		cmp += 3;
-	}
-
-	for (i = 0; i < __SCHED_FEAT_NR; i++) {
-		if (strcmp(cmp, sched_feat_names[i]) == 0) {
-			if (neg) {
-				sysctl_sched_features &= ~(1UL << i);
-				sched_feat_disable(i);
-			} else {
-				sysctl_sched_features |= (1UL << i);
-				sched_feat_enable(i);
-			}
-			break;
-		}
-	}
-
-	return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
-		size_t cnt, loff_t *ppos)
-{
-	char buf[64];
-	char *cmp;
-	int i;
-	struct inode *inode;
-
-	if (cnt > 63)
-		cnt = 63;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-
-	buf[cnt] = 0;
-	cmp = strstrip(buf);
-
-	/* Ensure the static_key remains in a consistent state */
-	inode = file_inode(filp);
-	inode_lock(inode);
-	i = sched_feat_set(cmp);
-	inode_unlock(inode);
-	if (i == __SCHED_FEAT_NR)
-		return -EINVAL;
-
-	*ppos += cnt;
-
-	return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-	return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
-	.open		= sched_feat_open,
-	.write		= sched_feat_write,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static __init int sched_init_debug(void)
-{
-	debugfs_create_file("sched_features", 0644, NULL, NULL,
-			&sched_feat_fops);
-
-	return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -2094,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
 	ttwu_queue(p, cpu);
 stat:
-	ttwu_stat(p, cpu, wake_flags);
+	if (schedstat_enabled())
+		ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2142,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p)
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_do_wakeup(rq, p, 0);
-	ttwu_stat(p, smp_processor_id(), 0);
+	if (schedstat_enabled())
+		ttwu_stat(p, smp_processor_id(), 0);
 out:
 	raw_spin_unlock(&p->pi_lock);
 }
@@ -2184,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_bw = 0;
 
 	dl_se->dl_throttled = 0;
-	dl_se->dl_new = 1;
 	dl_se->dl_yielded = 0;
 }
 
@@ -2211,6 +2078,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
+	/* Even if schedstat is disabled, there should not be garbage */
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
@@ -2219,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	__dl_clear_params(p);
 
 	INIT_LIST_HEAD(&p->rt.run_list);
+	p->rt.timeout		= 0;
+	p->rt.time_slice	= sched_rr_timeslice;
+	p->rt.on_rq		= 0;
+	p->rt.on_list		= 0;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2282,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+	if (enabled)
+		static_branch_enable(&sched_schedstats);
+	else
+		static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+	if (!schedstat_enabled()) {
+		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+		static_branch_enable(&sched_schedstats);
+	}
+}
+
+static int __init setup_schedstats(char *str)
+{
+	int ret = 0;
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		set_schedstats(true);
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		set_schedstats(false);
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse schedstats=\n");
+
+	return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = static_branch_likely(&sched_schedstats);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_schedstats(state);
+	return err;
+}
+#endif
+#endif
+
 /*
  * fork()/clone()-time setup:
  */
@@ -3011,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void)
 }
 #endif
 
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-	if (in_lock_functions(addr)) {
-		addr = CALLER_ADDR2;
-		if (in_lock_functions(addr))
-			addr = CALLER_ADDR3;
-	}
-	return addr;
-}
-
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
 
@@ -3042,7 +2967,7 @@ void preempt_count_add(int val)
 				PREEMPT_MASK - 10);
 #endif
 	if (preempt_count() == val) {
-		unsigned long ip = get_parent_ip(CALLER_ADDR1);
+		unsigned long ip = get_lock_parent_ip();
 #ifdef CONFIG_DEBUG_PREEMPT
 		current->preempt_disable_ip = ip;
 #endif
@@ -3069,7 +2994,7 @@ void preempt_count_sub(int val)
 #endif
 
 	if (preempt_count() == val)
-		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
 	__preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
@@ -3281,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt)
 
 		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
-		cpu = cpu_of(rq);
 	} else {
 		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
@@ -3467,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3495,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
+
+	if (oldprio == prio)
+		queue_flag &= ~DEQUEUE_MOVE;
+
 	prev_class = p->sched_class;
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE);
+		dequeue_task(rq, p, queue_flag);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3517,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag |= ENQUEUE_REPLENISH;
+			queue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3525,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag |= ENQUEUE_HEAD;
+			queue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3540,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, enqueue_flag);
+		enqueue_task(rq, p, queue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -3896,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	const struct sched_class *prev_class;
 	struct rq *rq;
 	int reset_on_fork;
+	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -4078,17 +4007,14 @@ change:
 		 * itself.
 		 */
 		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-		if (new_effective_prio == oldprio) {
-			__setscheduler_params(p, attr);
-			task_rq_unlock(rq, p, &flags);
-			return 0;
-		}
+		if (new_effective_prio == oldprio)
+			queue_flags &= ~DEQUEUE_MOVE;
 	}
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, DEQUEUE_SAVE);
+		dequeue_task(rq, p, queue_flags);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -4098,15 +4024,14 @@ change:
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
-		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		if (oldprio <= p->prio)
-			enqueue_flags |= ENQUEUE_HEAD;
+		if (oldprio < p->prio)
+			queue_flags |= ENQUEUE_HEAD;
 
-		enqueue_task(rq, p, enqueue_flags);
+		enqueue_task(rq, p, queue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5408,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-	{
-		.procname	= "sched_domain",
-		.mode		= 0555,
-	},
-	{}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= sd_ctl_dir,
-	},
-	{}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-	struct ctl_table *entry =
-		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-	return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-	struct ctl_table *entry;
-
-	/*
-	 * In the intermediate directories, both the child directory and
-	 * procname are dynamically allocated and could fail but the mode
-	 * will always be set. In the lowest directory the names are
-	 * static strings and all have proc handlers.
-	 */
-	for (entry = *tablep; entry->mode; entry++) {
-		if (entry->child)
-			sd_free_ctl_entry(&entry->child);
-		if (entry->proc_handler == NULL)
-			kfree(entry->procname);
-	}
-
-	kfree(*tablep);
-	*tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
-		const char *procname, void *data, int maxlen,
-		umode_t mode, proc_handler *proc_handler,
-		bool load_idx)
-{
-	entry->procname = procname;
-	entry->data = data;
-	entry->maxlen = maxlen;
-	entry->mode = mode;
-	entry->proc_handler = proc_handler;
-
-	if (load_idx) {
-		entry->extra1 = &min_load_idx;
-		entry->extra2 = &max_load_idx;
-	}
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-	struct ctl_table *table = sd_alloc_ctl_entry(14);
-
-	if (table == NULL)
-		return NULL;
-
-	set_table_entry(&table[0], "min_interval", &sd->min_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[1], "max_interval", &sd->max_interval,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-		sizeof(int), 0644, proc_dointvec_minmax, true);
-	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[9], "cache_nice_tries",
-		&sd->cache_nice_tries,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[10], "flags", &sd->flags,
-		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "max_newidle_lb_cost",
-		&sd->max_newidle_lb_cost,
-		sizeof(long), 0644, proc_doulongvec_minmax, false);
-	set_table_entry(&table[12], "name", sd->name,
-		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-	/* &table[13] is terminator */
-
-	return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-	struct ctl_table *entry, *table;
-	struct sched_domain *sd;
-	int domain_num = 0, i;
-	char buf[32];
-
-	for_each_domain(cpu, sd)
-		domain_num++;
-	entry = table = sd_alloc_ctl_entry(domain_num + 1);
-	if (table == NULL)
-		return NULL;
-
-	i = 0;
-	for_each_domain(cpu, sd) {
-		snprintf(buf, 32, "domain%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_domain_table(sd);
-		entry++;
-		i++;
-	}
-	return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-	int i, cpu_num = num_possible_cpus();
-	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-	char buf[32];
-
-	WARN_ON(sd_ctl_dir[0].child);
-	sd_ctl_dir[0].child = entry;
-
-	if (entry == NULL)
-		return;
-
-	for_each_possible_cpu(i) {
-		snprintf(buf, 32, "cpu%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_cpu_table(i);
-		entry++;
-	}
-
-	WARN_ON(sd_sysctl_header);
-	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-	unregister_sysctl_table(sd_sysctl_header);
-	sd_sysctl_header = NULL;
-	if (sd_ctl_dir[0].child)
-		sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
 static void set_rq_online(struct rq *rq)
 {
 	if (!rq->online) {
@@ -6177,11 +5925,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+	int ret;
+
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
-	cpulist_parse(str, cpu_isolated_map);
+	ret = cpulist_parse(str, cpu_isolated_map);
+	if (ret) {
+		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+		return 0;
+	}
 	return 1;
 }
-
 __setup("isolcpus=", isolated_cpu_setup);
 
 struct s_data {
@@ -7863,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg)
 void sched_offline_group(struct task_group *tg)
 {
 	unsigned long flags;
-	int i;
 
 	/* end participation in shares distribution */
-	for_each_possible_cpu(i)
-		unregister_fair_sched_group(tg, i);
+	unregister_fair_sched_group(tg);
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
@@ -7893,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, DEQUEUE_SAVE);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7917,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
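
The core.c hunks above replace the hard-coded DEQUEUE_SAVE/ENQUEUE_RESTORE pair with a queue_flag(s) mask so that DEQUEUE_MOVE/ENQUEUE_MOVE can be dropped when the effective priority does not change. A minimal user-space sketch of that flag pairing follows; the toy_* names are invented for illustration and this is not kernel code.

/* Illustration only -- a user-space sketch of the SAVE/MOVE flag pairing,
 * not kernel code.  toy_task, toy_dequeue and toy_enqueue are made up. */
#include <stdio.h>

#define DEQUEUE_SAVE   0x02   /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE   0x04   /* matches ENQUEUE_MOVE    */
#define ENQUEUE_HEAD   0x08

struct toy_task { const char *name; int queued; int at_head; };

static void toy_dequeue(struct toy_task *t, unsigned int flags)
{
	/* SAVE without MOVE keeps the task where it is; only the
	 * bookkeeping is redone, mirroring the __sched_setscheduler()
	 * case where the effective priority did not change. */
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) {
		printf("%s: state saved, position kept\n", t->name);
		return;
	}
	t->queued = 0;
	printf("%s: removed from the queue\n", t->name);
}

static void toy_enqueue(struct toy_task *t, unsigned int flags)
{
	if (t->queued)
		return;
	t->queued = 1;
	t->at_head = !!(flags & ENQUEUE_HEAD);
	printf("%s: re-queued at the %s\n", t->name,
	       t->at_head ? "head" : "tail");
}

int main(void)
{
	struct toy_task t = { "demo", 1, 0 };

	/* priority change that alters the effective priority */
	toy_dequeue(&t, DEQUEUE_SAVE | DEQUEUE_MOVE);
	toy_enqueue(&t, ENQUEUE_HEAD);

	/* priority change that leaves the effective priority untouched */
	toy_dequeue(&t, DEQUEUE_SAVE);
	return 0;
}
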
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ab2b5fb..75f98c5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
 {
-	unsigned long long clock;
+	unsigned long now = READ_ONCE(jiffies);
 
-	clock = local_clock();
-	if (clock < tsk->vtime_snap)
+	if (time_before(now, (unsigned long)tsk->vtime_snap))
 		return 0;
 
-	return clock - tsk->vtime_snap;
+	return jiffies_to_cputime(now - tsk->vtime_snap);
 }
 
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
-	unsigned long long delta = vtime_delta(tsk);
+	unsigned long now = READ_ONCE(jiffies);
+	unsigned long delta = now - tsk->vtime_snap;
 
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-	tsk->vtime_snap += delta;
+	tsk->vtime_snap = now;
 
-	/* CHECKME: always safe to convert nsecs to cputime? */
-	return nsecs_to_cputime(delta);
+	return jiffies_to_cputime(delta);
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
+	if (!vtime_delta(tsk))
+		return;
+
 	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_delta(tsk))
+		__vtime_account_system(tsk);
 	if (context_tracking_in_user())
 		tsk->vtime_snap_whence = VTIME_USER;
 	write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
 	cputime_t delta_cpu;
 
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	delta_cpu = get_vtime_delta(tsk);
 	tsk->vtime_snap_whence = VTIME_SYS;
-	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	if (vtime_delta(tsk)) {
+		delta_cpu = get_vtime_delta(tsk);
+		account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	}
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_delta(tsk))
+		__vtime_account_system(tsk);
 	tsk->vtime_snap_whence = VTIME_USER;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	__vtime_account_system(tsk);
+	if (vtime_delta(tsk))
+		__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&current->vtime_seqcount);
 	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = sched_clock_cpu(smp_processor_id());
+	current->vtime_snap = jiffies;
 	write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&t->vtime_seqcount);
 	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = sched_clock_cpu(cpu);
+	t->vtime_snap = jiffies;
 	write_seqcount_end(&t->vtime_seqcount);
 	local_irq_restore(flags);
 }
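
The cputime.c hunks above move the CONFIG_VIRT_CPU_ACCOUNTING_GEN snapshots from sched_clock() nanoseconds to jiffies, so the accounting paths become no-ops until at least one tick has elapsed. A minimal user-space sketch of that delta logic; fake_jiffies, before() and struct vtime are invented stand-ins, not kernel code.

/* Illustration only: jiffies-granular vtime delta, user-space sketch. */
#include <stdio.h>

static unsigned long fake_jiffies;          /* stands in for jiffies */
#define HZ 250

/* wraparound-safe "a before b", same idea as the kernel's time_before() */
static int before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

struct vtime { unsigned long snap; unsigned long long system_ns; };

static unsigned long vtime_delta(const struct vtime *v)
{
	unsigned long now = fake_jiffies;

	if (before(now, v->snap))
		return 0;
	return now - v->snap;               /* whole ticks only */
}

static void account_system(struct vtime *v)
{
	unsigned long delta = vtime_delta(v);

	if (!delta)                         /* less than one tick: skip */
		return;
	v->snap = fake_jiffies;
	v->system_ns += (unsigned long long)delta * (1000000000ULL / HZ);
}

int main(void)
{
	struct vtime v = { .snap = 0, .system_ns = 0 };

	account_system(&v);                 /* no tick elapsed -> no-op */
	fake_jiffies += 3;                  /* three ticks pass */
	account_system(&v);
	printf("accounted %llu ns\n", v.system_ns);
	return 0;
}
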
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 57b939c..c7a036f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+	/*
+	 * We are racing with the deadline timer. So, do nothing because
+	 * the deadline timer handler will take care of properly recharging
+	 * the runtime and postponing the deadline.
+	 */
+	if (dl_se->dl_throttled)
+		return;
 
 	/*
 	 * We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
 	 */
 	dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
 	dl_se->runtime = pi_se->dl_runtime;
-	dl_se->dl_new = 0;
 }
 
 /*
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
 		dl_se->runtime = pi_se->dl_runtime;
 	}
 
+	if (dl_se->dl_yielded && dl_se->runtime > 0)
+		dl_se->runtime = 0;
+
 	/*
 	 * We keep moving the deadline away until we get some
 	 * available runtime for the entity. This ensures correct
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	/*
-	 * The arrival of a new instance needs special treatment, i.e.,
-	 * the actual scheduling parameters have to be "renewed".
-	 */
-	if (dl_se->dl_new) {
-		setup_new_dl_entity(dl_se, pi_se);
-		return;
-	}
-
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
 	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
 		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	}
 
 	/*
-	 * This is possible if switched_from_dl() raced against a running
-	 * callback that took the above !dl_task() path and we've since then
-	 * switched back into SCHED_DEADLINE.
-	 *
-	 * There's nothing to do except drop our task reference.
-	 */
-	if (dl_se->dl_new)
-		goto unlock;
-
-	/*
 	 * The task might have been boosted by someone else and might be in the
 	 * boosting/deboosting path, its not throttled.
 	 */
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
 	 * approach need further study.
 	 */
 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0))
+	if (unlikely((s64)delta_exec <= 0)) {
+		if (unlikely(dl_se->dl_yielded))
+			goto throttle;
 		return;
+	}
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
-	dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-	if (dl_runtime_exceeded(dl_se)) {
+	dl_se->runtime -= delta_exec;
+
+throttle:
+	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
 		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 	 * parameters of the task might need updating. Otherwise,
 	 * we want a replenishment of its runtime.
 	 */
-	if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+	if (flags & ENQUEUE_WAKEUP)
 		update_dl_entity(dl_se, pi_se);
 	else if (flags & ENQUEUE_REPLENISH)
 		replenish_dl_entity(dl_se, pi_se);
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
  */
 static void yield_task_dl(struct rq *rq)
 {
-	struct task_struct *p = rq->curr;
-
 	/*
 	 * We make the task go to sleep until its current deadline by
 	 * forcing its runtime to zero. This way, update_curr_dl() stops
 	 * it and the bandwidth timer will wake it up and will give it
 	 * new scheduling parameters (thanks to dl_yielded=1).
 	 */
-	if (p->dl.runtime > 0) {
-		rq->curr->dl.dl_yielded = 1;
-		p->dl.runtime = 0;
-	}
+	rq->curr->dl.dl_yielded = 1;
+
 	update_rq_clock(rq);
 	update_curr_dl(rq);
 	/*
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
+	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+		setup_new_dl_entity(&p->dl, &p->dl);
+
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 */
 		resched_curr(rq);
 #endif /* CONFIG_SMP */
-	} else
-		switched_to_dl(rq, p);
+	}
 }
 
 const struct sched_class dl_sched_class = {
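
The deadline.c changes above make yield_task_dl() unconditionally set dl_yielded, so update_curr_dl() throttles the task until the bandwidth timer replenishes it at the next period. A toy model of that yield/throttle/replenish cycle; the toy_* names and the numbers are invented and this is not the kernel's CBS code.

/* Illustration only: dl_yielded throttle/replenish cycle, user-space toy. */
#include <stdio.h>

struct toy_dl {
	long long runtime;      /* remaining budget, ns */
	long long dl_runtime;   /* budget per period, ns */
	long long deadline;     /* absolute, ns */
	long long dl_period;    /* period, ns */
	int yielded, throttled;
};

static void toy_update_curr(struct toy_dl *se, long long now, long long ran)
{
	se->runtime -= ran;
	/* yielding counts as running out of budget */
	if (se->runtime <= 0 || se->yielded) {
		se->throttled = 1;        /* dequeued until the timer fires */
		printf("throttled at t=%lld\n", now);
	}
}

static void toy_replenish(struct toy_dl *se)
{
	se->deadline += se->dl_period;
	se->runtime = se->dl_runtime;
	se->yielded = se->throttled = 0;
	printf("replenished: deadline=%lld runtime=%lld\n",
	       se->deadline, se->runtime);
}

int main(void)
{
	struct toy_dl se = { 3000000, 3000000, 10000000, 10000000, 0, 0 };

	se.yielded = 1;                   /* task called sched_yield() */
	toy_update_curr(&se, 4000000, 1000000);
	toy_replenish(&se);               /* timer handler at the new period */
	return 0;
}
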
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6415117..4fbc3bd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -16,6 +16,7 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 #include <linux/mempolicy.h>
+#include <linux/debugfs.h>
 
 #include "sched.h"
 
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#define SCHED_FEAT(name, enabled)	\
+	#name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	for (i = 0; i < __SCHED_FEAT_NR; i++) {
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
+	}
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)	\
+	jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+	static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+	static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+	int i;
+	int neg = 0;
+
+	if (strncmp(cmp, "NO_", 3) == 0) {
+		neg = 1;
+		cmp += 3;
+	}
+
+	for (i = 0; i < __SCHED_FEAT_NR; i++) {
+		if (strcmp(cmp, sched_feat_names[i]) == 0) {
+			if (neg) {
+				sysctl_sched_features &= ~(1UL << i);
+				sched_feat_disable(i);
+			} else {
+				sysctl_sched_features |= (1UL << i);
+				sched_feat_enable(i);
+			}
+			break;
+		}
+	}
+
+	return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+		size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp;
+	int i;
+	struct inode *inode;
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+	cmp = strstrip(buf);
+
+	/* Ensure the static_key remains in a consistent state */
+	inode = file_inode(filp);
+	inode_lock(inode);
+	i = sched_feat_set(cmp);
+	inode_unlock(inode);
+	if (i == __SCHED_FEAT_NR)
+		return -EINVAL;
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+	.open		= sched_feat_open,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+	debugfs_create_file("sched_features", 0644, NULL, NULL,
+			&sched_feat_fops);
+
+	return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		umode_t mode, proc_handler *proc_handler,
+		bool load_idx)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+
+	if (load_idx) {
+		entry->extra1 = &min_load_idx;
+		entry->extra2 = &max_load_idx;
+	}
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+	/* &table[13] is terminator */
+
+	return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct ctl_table *entry, *table;
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	char buf[32];
+
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+	if (table == NULL)
+		return NULL;
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_possible_cpus();
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	char buf[32];
+
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
+	if (entry == NULL)
+		return;
+
+	for_each_possible_cpu(i) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+		entry++;
+	}
+
+	WARN_ON(sd_sysctl_header);
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+	unregister_sysctl_table(sd_sysctl_header);
+	sd_sysctl_header = NULL;
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-	PN(se->statistics.wait_start);
-	PN(se->statistics.sleep_start);
-	PN(se->statistics.block_start);
-	PN(se->statistics.sleep_max);
-	PN(se->statistics.block_max);
-	PN(se->statistics.exec_max);
-	PN(se->statistics.slice_max);
-	PN(se->statistics.wait_max);
-	PN(se->statistics.wait_sum);
-	P(se->statistics.wait_count);
+	if (schedstat_enabled()) {
+		PN(se->statistics.wait_start);
+		PN(se->statistics.sleep_start);
+		PN(se->statistics.block_start);
+		PN(se->statistics.sleep_max);
+		PN(se->statistics.block_max);
+		PN(se->statistics.exec_max);
+		PN(se->statistics.slice_max);
+		PN(se->statistics.wait_max);
+		PN(se->statistics.wait_sum);
+		P(se->statistics.wait_count);
+	}
 #endif
 	P(se->load.weight);
 #ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
 #ifdef CONFIG_SCHEDSTATS
-	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		SPLIT_NS(p->se.statistics.wait_sum),
-		SPLIT_NS(p->se.sum_exec_runtime),
-		SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+	if (schedstat_enabled()) {
+		SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+			SPLIT_NS(p->se.statistics.wait_sum),
+			SPLIT_NS(p->se.sum_exec_runtime),
+			SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+	}
 #else
 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 		0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 
 void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 {
+	struct dl_bw *dl_bw;
+
 	SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
 	SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+	dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+	dl_bw = &dl_rq->dl_bw;
+#endif
+	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
 }
 
 extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do {									\
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
-	P(yld_count);
-
-	P(sched_count);
-	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
 	P64(max_idle_balance_cost);
 #endif
 
-	P(ttwu_count);
-	P(ttwu_local);
+	if (schedstat_enabled()) {
+		P(yld_count);
+		P(sched_count);
+		P(sched_goidle);
+		P(ttwu_count);
+		P(ttwu_local);
+	}
 
 #undef P
 #undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-	PN(se.statistics.sum_sleep_runtime);
-	PN(se.statistics.wait_start);
-	PN(se.statistics.sleep_start);
-	PN(se.statistics.block_start);
-	PN(se.statistics.sleep_max);
-	PN(se.statistics.block_max);
-	PN(se.statistics.exec_max);
-	PN(se.statistics.slice_max);
-	PN(se.statistics.wait_max);
-	PN(se.statistics.wait_sum);
-	P(se.statistics.wait_count);
-	PN(se.statistics.iowait_sum);
-	P(se.statistics.iowait_count);
 	P(se.nr_migrations);
-	P(se.statistics.nr_migrations_cold);
-	P(se.statistics.nr_failed_migrations_affine);
-	P(se.statistics.nr_failed_migrations_running);
-	P(se.statistics.nr_failed_migrations_hot);
-	P(se.statistics.nr_forced_migrations);
-	P(se.statistics.nr_wakeups);
-	P(se.statistics.nr_wakeups_sync);
-	P(se.statistics.nr_wakeups_migrate);
-	P(se.statistics.nr_wakeups_local);
-	P(se.statistics.nr_wakeups_remote);
-	P(se.statistics.nr_wakeups_affine);
-	P(se.statistics.nr_wakeups_affine_attempts);
-	P(se.statistics.nr_wakeups_passive);
-	P(se.statistics.nr_wakeups_idle);
 
-	{
+	if (schedstat_enabled()) {
 		u64 avg_atom, avg_per_cpu;
 
+		PN(se.statistics.sum_sleep_runtime);
+		PN(se.statistics.wait_start);
+		PN(se.statistics.sleep_start);
+		PN(se.statistics.block_start);
+		PN(se.statistics.sleep_max);
+		PN(se.statistics.block_max);
+		PN(se.statistics.exec_max);
+		PN(se.statistics.slice_max);
+		PN(se.statistics.wait_max);
+		PN(se.statistics.wait_sum);
+		P(se.statistics.wait_count);
+		PN(se.statistics.iowait_sum);
+		P(se.statistics.iowait_count);
+		P(se.statistics.nr_migrations_cold);
+		P(se.statistics.nr_failed_migrations_affine);
+		P(se.statistics.nr_failed_migrations_running);
+		P(se.statistics.nr_failed_migrations_hot);
+		P(se.statistics.nr_forced_migrations);
+		P(se.statistics.nr_wakeups);
+		P(se.statistics.nr_wakeups_sync);
+		P(se.statistics.nr_wakeups_migrate);
+		P(se.statistics.nr_wakeups_local);
+		P(se.statistics.nr_wakeups_remote);
+		P(se.statistics.nr_wakeups_affine);
+		P(se.statistics.nr_wakeups_affine_attempts);
+		P(se.statistics.nr_wakeups_passive);
+		P(se.statistics.nr_wakeups_idle);
+
 		avg_atom = p->se.sum_exec_runtime;
 		if (nr_switches)
 			avg_atom = div64_ul(avg_atom, nr_switches);
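
sched_init_debug() above moves the sched_features file under debugfs; its write handler strips an optional "NO_" prefix and flips the matching feature bit, returning -EINVAL when no name matches. A user-space sketch of that parsing; the two feature names here are just examples picked for the demo.

/* Illustration only: "NO_" prefix parsing for a feature bitmask. */
#include <stdio.h>
#include <string.h>

static const char * const feat_names[] = {
	"GENTLE_FAIR_SLEEPERS",
	"START_DEBIT",
};
#define NR_FEATS (sizeof(feat_names) / sizeof(feat_names[0]))

static unsigned long features = (1UL << 0) | (1UL << 1);

static int feat_set(const char *cmp)
{
	unsigned int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}
	for (i = 0; i < NR_FEATS; i++) {
		if (strcmp(cmp, feat_names[i]) == 0) {
			if (neg)
				features &= ~(1UL << i);
			else
				features |= (1UL << i);
			return 0;
		}
	}
	return -1;                        /* would become -EINVAL */
}

int main(void)
{
	feat_set("NO_START_DEBIT");
	printf("features: %#lx\n", features);   /* bit 1 cleared */
	return 0;
}
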
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index adff850..ac7fb39 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
@@ -47,8 +47,13 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_latency = 3000000ULL;
+unsigned int normalized_sysctl_sched_latency = 3000000ULL;
+#else
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#endif
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -66,13 +71,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_min_granularity = 300000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 300000ULL;
+#else
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+#endif
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+static unsigned int sched_nr_latency = 10;
+#else
 static unsigned int sched_nr_latency = 8;
+#endif
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -88,10 +102,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_wakeup_granularity = 500000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 250000UL;
+#else
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#endif
 
 /*
  * The exponential sliding  window over which load is averaged for shares
@@ -111,8 +132,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  *
  * default: 5 msec, units: microseconds
   */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
+#else
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
+#endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -755,7 +780,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct task_struct *p;
-	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+	u64 delta;
+
+	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
 	if (entity_is_task(se)) {
 		p = task_of(se);
@@ -776,22 +803,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->statistics.wait_sum += delta;
 	se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/*
 	 * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +819,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Mark the end of the wait period if dequeueing a
@@ -810,7 +827,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 */
 	if (se != cfs_rq->curr)
 		update_stats_wait_end(cfs_rq, se);
+
+	if (flags & DEQUEUE_SLEEP) {
+		if (entity_is_task(se)) {
+			struct task_struct *tsk = task_of(se);
+
+			if (tsk->state & TASK_INTERRUPTIBLE)
+				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+			if (tsk->state & TASK_UNINTERRUPTIBLE)
+				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+		}
+	}
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
 }
+#endif
 
 /*
  * We are picking a new current task - update its stats:
@@ -907,10 +957,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +1020,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
@@ -1118,27 +1181,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
-	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1543,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1536,8 +1595,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 *   multiple NUMA nodes; in order to better consolidate the group,
 	 *   we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1572,12 +1630,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1627,20 +1687,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1705,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1946,7 +2001,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -1990,14 +2045,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
 
@@ -2111,6 +2166,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2207,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-			node_isset(cpu_node, p->numa_group->active_nodes) &&
-			node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+				numa_is_active_node(cpu_node, ng) &&
+				numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
@@ -3102,6 +3159,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+	if (schedstat_enabled())
+		return;
+
+	/* Force schedstat enabled if a dependent tracepoint is active */
+	if (trace_sched_stat_wait_enabled()    ||
+			trace_sched_stat_sleep_enabled()   ||
+			trace_sched_stat_iowait_enabled()  ||
+			trace_sched_stat_blocked_enabled() ||
+			trace_sched_stat_runtime_enabled())  {
+		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+			     "stat_blocked and stat_runtime require the "
+			     "kernel parameter schedstats=enabled or "
+			     "kernel.sched_schedstats=1\n");
+	}
+#endif
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3122,11 +3199,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
-		enqueue_sleeper(cfs_rq, se);
+		if (schedstat_enabled())
+			enqueue_sleeper(cfs_rq, se);
 	}
 
-	update_stats_enqueue(cfs_rq, se);
-	check_spread(cfs_rq, se);
+	check_schedstat_required();
+	if (schedstat_enabled()) {
+		update_stats_enqueue(cfs_rq, se);
+		check_spread(cfs_rq, se);
+	}
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
@@ -3193,19 +3274,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_curr(cfs_rq);
 	dequeue_entity_load_avg(cfs_rq, se);
 
-	update_stats_dequeue(cfs_rq, se);
-	if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-		if (entity_is_task(se)) {
-			struct task_struct *tsk = task_of(se);
-
-			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-		}
-#endif
-	}
+	if (schedstat_enabled())
+		update_stats_dequeue(cfs_rq, se, flags);
 
 	clear_buddies(cfs_rq, se);
 
@@ -3279,7 +3349,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * a CPU. So account for the time it spent waiting on the
 		 * runqueue.
 		 */
-		update_stats_wait_end(cfs_rq, se);
+		if (schedstat_enabled())
+			update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
 		update_load_avg(se, 1);
 	}
@@ -3292,7 +3363,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * least twice that of our own weight (i.e. dont track it
 	 * when there are only lesser-weight tasks around):
 	 */
-	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 		se->statistics.slice_max = max(se->statistics.slice_max,
 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
 	}
@@ -3375,9 +3446,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
-	check_spread(cfs_rq, prev);
+	if (schedstat_enabled()) {
+		check_spread(cfs_rq, prev);
+		if (prev->on_rq)
+			update_stats_wait_start(cfs_rq, prev);
+	}
+
 	if (prev->on_rq) {
-		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
@@ -4492,6 +4567,25 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+				   unsigned long curr_jiffies,
+				   unsigned long load,
+				   int active)
+{
+	unsigned long pending_updates;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * In the regular NOHZ case, we were idle, this means load 0.
+		 * In the NOHZ_FULL case, we were non-idle, we should consider
+		 * its weighted load.
+		 */
+		__update_cpu_load(this_rq, load, pending_updates, active);
+	}
+}
+
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4509,22 +4603,15 @@ static unsigned long weighted_cpuload(const int cpu)
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
-	unsigned long pending_updates;
-
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
+	if (weighted_cpuload(cpu_of(this_rq)))
 		return;
 
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates, 0);
+	__update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
@@ -4535,22 +4622,12 @@ void update_cpu_load_nohz(int active)
 	struct rq *this_rq = this_rq();
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-	unsigned long pending_updates;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
 	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * In the regular NOHZ case, we were idle, this means load 0.
-		 * In the NOHZ_FULL case, we were non-idle, we should consider
-		 * its weighted load.
-		 */
-		__update_cpu_load(this_rq, load, pending_updates, active);
-	}
+	__update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
 	raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4562,7 +4639,7 @@ void update_cpu_load_active(struct rq *this_rq)
 {
 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
 	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
 	__update_cpu_load(this_rq, load, 1, 1);
@@ -7856,7 +7933,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			raw_spin_lock_irq(&rq->lock);
 			update_rq_clock(rq);
-			update_idle_cpu_load(rq);
+			update_cpu_load_idle(rq);
 			raw_spin_unlock_irq(&rq->lock);
 			rebalance_domains(rq, CPU_IDLE);
 		}
@@ -8242,11 +8319,8 @@ void free_fair_sched_group(struct task_group *tg)
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
-		if (tg->se) {
-			if (tg->se[i])
-				remove_entity_load_avg(tg->se[i]);
+		if (tg->se)
 			kfree(tg->se[i]);
-		}
 	}
 
 	kfree(tg->cfs_rq);
@@ -8294,21 +8368,29 @@ err:
 	return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
+	struct rq *rq;
+	int cpu;
 
-	/*
-	* Only empty task groups can be destroyed; so we can speculatively
-	* check on_list without danger of it being re-added.
-	*/
-	if (!tg->cfs_rq[cpu]->on_list)
-		return;
+	for_each_possible_cpu(cpu) {
+		if (tg->se[cpu])
+			remove_entity_load_avg(tg->se[cpu]);
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+		/*
+		 * Only empty task groups can be destroyed; so we can speculatively
+		 * check on_list without danger of it being re-added.
+		 */
+		if (!tg->cfs_rq[cpu]->on_list)
+			continue;
+
+		rq = cpu_rq(cpu);
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8390,7 +8472,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
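
Among the fair.c changes above, the nodemask-based NUMA tracking is replaced by a single active_nodes count: a node counts as active when its CPU fault count exceeds 1/3 of the per-group maximum (ACTIVE_NODE_FRACTION). A stand-alone sketch of that rule with invented fault counts, not kernel code:

/* Illustration only: the 1/3-of-maximum active-node rule. */
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

int main(void)
{
	unsigned long faults[] = { 900, 250, 310, 40 };  /* per-node CPU faults */
	int nr_nodes = 4, nid, active = 0;
	unsigned long max_faults = 0;

	for (nid = 0; nid < nr_nodes; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];

	for (nid = 0; nid < nr_nodes; nid++) {
		int is_active = faults[nid] * ACTIVE_NODE_FRACTION > max_faults;

		if (is_active)
			active++;
		printf("node %d: %lu faults -> %s\n", nid, faults[nid],
		       is_active ? "active" : "inactive");
	}
	printf("active_nodes = %d (max_faults_cpu = %lu)\n", active, max_faults);
	return 0;
}

With these counts only nodes 0 and 2 qualify, which is the same answer numa_group_count_active_nodes() would record in active_nodes.
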
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a713..bbf8139 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -13,7 +13,11 @@
 
 #include <trace/events/power.h>
 
+#ifdef CONFIG_SCHED_BFS
+#include "bfs_sched.h"
+#else
 #include "sched.h"
+#endif
 
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86ab..a774b4d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	raw_spin_lock(&rt_b->rt_runtime_lock);
 	if (!rt_b->rt_period_active) {
 		rt_b->rt_period_active = 1;
-		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+		/*
+		 * SCHED_DEADLINE updates the bandwidth, as a run away
+		 * RT task with a DL task could hog a CPU. But DL does
+		 * not reset the period. If a deadline task was running
+		 * without an RT task running, it can cause RT tasks to
+		 * throttle when they start up. Kick the timer right away
+		 * to update the period.
+		 */
+		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
 		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
 	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-	return !list_empty(&rt_se->run_list);
+	return rt_se->on_rq;
 }
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 	return rt_se->my_q;
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 		if (!rt_se)
 			enqueue_top_rt_rq(rt_rq);
 		else if (!on_rt_rq(rt_se))
-			enqueue_rt_entity(rt_se, false);
+			enqueue_rt_entity(rt_se, 0);
 
 		if (rt_rq->highest_prio.curr < curr->prio)
 			resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 	if (!rt_se)
 		dequeue_top_rt_rq(rt_rq);
 	else if (on_rt_rq(rt_se))
-		dequeue_rt_entity(rt_se);
+		dequeue_rt_entity(rt_se, 0);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	dec_rt_group(rt_se, rt_rq);
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+		return false;
+
+	return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+	list_del_init(&rt_se->run_list);
+
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
+		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+	rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 	 * get throttled and the current group doesn't have any other
 	 * active members.
 	 */
-	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+		if (rt_se->on_list)
+			__delist_rt_entity(rt_se, array);
 		return;
+	}
 
-	if (head)
-		list_add(&rt_se->run_list, queue);
-	else
-		list_add_tail(&rt_se->run_list, queue);
-	__set_bit(rt_se_prio(rt_se), array->bitmap);
+	if (move_entity(flags)) {
+		WARN_ON_ONCE(rt_se->on_list);
+		if (flags & ENQUEUE_HEAD)
+			list_add(&rt_se->run_list, queue);
+		else
+			list_add_tail(&rt_se->run_list, queue);
+
+		__set_bit(rt_se_prio(rt_se), array->bitmap);
+		rt_se->on_list = 1;
+	}
+	rt_se->on_rq = 1;
 
 	inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 
-	list_del_init(&rt_se->run_list);
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
-		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+	if (move_entity(flags)) {
+		WARN_ON_ONCE(!rt_se->on_list);
+		__delist_rt_entity(rt_se, array);
+	}
+	rt_se->on_rq = 0;
 
 	dec_rt_tasks(rt_se, rt_rq);
 }
@@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct sched_rt_entity *back = NULL;
 
@@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 		if (on_rt_rq(rt_se))
-			__dequeue_rt_entity(rt_se);
+			__dequeue_rt_entity(rt_se, flags);
 	}
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
 
-	dequeue_rt_stack(rt_se);
+	dequeue_rt_stack(rt_se, flags);
 	for_each_sched_rt_entity(rt_se)
-		__enqueue_rt_entity(rt_se, head);
+		__enqueue_rt_entity(rt_se, flags);
 	enqueue_top_rt_rq(&rq->rt);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct rq *rq = rq_of_rt_se(rt_se);
 
-	dequeue_rt_stack(rt_se);
+	dequeue_rt_stack(rt_se, flags);
 
 	for_each_sched_rt_entity(rt_se) {
 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 		if (rt_rq && rt_rq->rt_nr_running)
-			__enqueue_rt_entity(rt_se, false);
+			__enqueue_rt_entity(rt_se, flags);
 	}
 	enqueue_top_rt_rq(&rq->rt);
 }
@@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
-	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+	enqueue_rt_entity(rt_se, flags);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
@@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_rt_entity *rt_se = &p->rt;
 
 	update_curr_rt(rq);
-	dequeue_rt_entity(rt_se);
+	dequeue_rt_entity(rt_se, flags);
 
 	dequeue_pushable_task(rq, p);
 }
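
move_entity() above decides whether an rt entity's run_list position may change: it stays put only for a DEQUEUE_SAVE that is not also a DEQUEUE_MOVE. The same decision as a stand-alone truth table (flag values copied from the sched.h hunk below; not kernel code):

/* Illustration only: move_entity() over the dequeue flags. */
#include <stdio.h>
#include <stdbool.h>

#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE  0x02
#define DEQUEUE_MOVE  0x04

static bool move_entity(unsigned int flags)
{
	/* keep the list position only for a SAVE that is not also a MOVE */
	return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
	unsigned int cases[] = {
		0,                              /* plain dequeue        */
		DEQUEUE_SLEEP,                  /* task going to sleep  */
		DEQUEUE_SAVE,                   /* save, keep position  */
		DEQUEUE_SAVE | DEQUEUE_MOVE,    /* save and relocate    */
	};
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("flags=%#x -> %s run_list\n", cases[i],
		       move_entity(cases[i]) ? "touch" : "keep");
	return 0;
}
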
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ff87d88..60c5b6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *se, int cpu,
 			struct sched_entity *parent);
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -909,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
 
 static inline u64 global_rt_period(void)
 {
@@ -1130,18 +1143,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 extern const int sched_prio_to_weight[40];
 extern const u32 sched_prio_to_wmult[40];
 
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING    - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+
 #define ENQUEUE_WAKEUP		0x01
-#define ENQUEUE_HEAD		0x02
+#define ENQUEUE_RESTORE		0x02
+#define ENQUEUE_MOVE		0x04
+
+#define ENQUEUE_HEAD		0x08
+#define ENQUEUE_REPLENISH	0x10
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x20
 #else
 #define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	0x08
-#define ENQUEUE_RESTORE	0x10
-
-#define DEQUEUE_SLEEP		0x01
-#define DEQUEUE_SAVE		0x02
 
 #define RETRY_TASK		((void *)-1UL)
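
The comment block above documents that DEQUEUE_SAVE pairs with ENQUEUE_RESTORE and DEQUEUE_MOVE with ENQUEUE_MOVE, so one queue_flags mask can be passed through both dequeue_task() and enqueue_task() unchanged. A stand-alone file expressing that pairing as compile-time checks; the asserts are illustrative only, the kernel headers do not carry them.

/* Illustration only: the flag pairing as C11 compile-time checks. */
#define DEQUEUE_SLEEP		0x01
#define DEQUEUE_SAVE		0x02
#define DEQUEUE_MOVE		0x04

#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_RESTORE		0x02
#define ENQUEUE_MOVE		0x04

_Static_assert(DEQUEUE_SAVE == ENQUEUE_RESTORE, "SAVE pairs with RESTORE");
_Static_assert(DEQUEUE_MOVE == ENQUEUE_MOVE, "MOVE pairs with MOVE");

int main(void) { return 0; }
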
 
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f..7466a0b 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,7 +4,11 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 
+#ifndef CONFIG_SCHED_BFS
 #include "sched.h"
+#else
+#include "bfs_sched.h"
+#endif
 
 /*
  * bump this up when changing the output format or the meaning of an existing
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc76..70b3b6a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 	if (rq)
 		rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)	do { var = (val); } while (0)
+# define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
+# define schedstat_enabled()		0
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
 # define schedstat_set(var, val)	do { } while (0)
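With the static branch in place, call sites keep using the same macros; the accounting only runs when schedstats are switched on at run time, for instance through the kernel.sched_schedstats sysctl this patch adds to kernel/sysctl.c. A one-line sketch:

	/* Counts a sched_yield() only while the sched_schedstats key is enabled. */
	schedstat_inc(rq, yld_count);
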
diff --git b/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 0000000..82f0dff
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+			     struct lock_class_key *key)
+{
+	raw_spin_lock_init(&q->lock);
+	lockdep_set_class_and_name(&q->lock, key, name);
+	INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+	struct swait_queue *curr;
+
+	if (list_empty(&q->task_list))
+		return;
+
+	curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+	wake_up_process(curr->task);
+	list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+	unsigned long flags;
+
+	if (!swait_active(q))
+		return;
+
+	raw_spin_lock_irqsave(&q->lock, flags);
+	swake_up_locked(q);
+	raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+	struct swait_queue *curr;
+	LIST_HEAD(tmp);
+
+	if (!swait_active(q))
+		return;
+
+	raw_spin_lock_irq(&q->lock);
+	list_splice_init(&q->task_list, &tmp);
+	while (!list_empty(&tmp)) {
+		curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+		wake_up_state(curr->task, TASK_NORMAL);
+		list_del_init(&curr->task_list);
+
+		if (list_empty(&tmp))
+			break;
+
+		raw_spin_unlock_irq(&q->lock);
+		raw_spin_lock_irq(&q->lock);
+	}
+	raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	wait->task = current;
+	if (list_empty(&wait->task_list))
+		list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&q->lock, flags);
+	__prepare_to_swait(q, wait);
+	set_current_state(state);
+	raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+	if (signal_pending_state(state, current))
+		return -ERESTARTSYS;
+
+	prepare_to_swait(q, wait, state);
+
+	return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	__set_current_state(TASK_RUNNING);
+	if (!list_empty(&wait->task_list))
+		list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+	unsigned long flags;
+
+	__set_current_state(TASK_RUNNING);
+
+	if (!list_empty_careful(&wait->task_list)) {
+		raw_spin_lock_irqsave(&q->lock, flags);
+		list_del_init(&wait->task_list);
+		raw_spin_unlock_irqrestore(&q->lock, flags);
+	}
+}
+EXPORT_SYMBOL(finish_swait);
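A hedged sketch of how a waiter/waker pair uses this simplified wait-queue API; the DECLARE_SWAIT_QUEUE_HEAD() and DECLARE_SWAITQUEUE() helpers live in the new include/linux/swait.h, outside this hunk, and the condition flag is a placeholder:

	static DECLARE_SWAIT_QUEUE_HEAD(q);
	static bool condition;

	/* Waiter side */
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&q, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(condition))
			break;
		schedule();
	}
	finish_swait(&q, &wait);

	/* Waker side */
	WRITE_ONCE(condition, true);
	swake_up(&q);
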
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59..28c8e73 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -174,7 +174,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 	if (tsk)
 		return 0;
 
-	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
 	if (!td)
 		return -ENOMEM;
 	td->cpu = cpu;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e443..8aae49d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 
 	if (preempt_count() == cnt) {
 #ifdef CONFIG_DEBUG_PREEMPT
-		current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+		current->preempt_disable_ip = get_lock_parent_ip();
 #endif
-		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+		trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
 	}
 }
 EXPORT_SYMBOL(__local_bh_disable_ip);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97715fd..c488777 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,7 +125,13 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
-static int one_hundred = 100;
+static int __maybe_unused one_hundred = 100;
+#ifdef CONFIG_SCHED_BFS
+extern int rr_interval;
+extern int sched_interactive;
+extern int sched_iso_cpu;
+static int __read_mostly one_thousand = 1000;
+#endif
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -260,7 +266,7 @@ static struct ctl_table sysctl_base_table[] = {
 	{ }
 };
 
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -277,6 +283,7 @@ static int max_extfrag_threshold = 1000;
 #endif
 
 static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_BFS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -350,6 +357,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHEDSTATS
+	{
+		.procname	= "sched_schedstats",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_schedstats,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
 	{
@@ -434,6 +452,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#endif /* !CONFIG_SCHED_BFS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -505,7 +524,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &latencytop_enabled,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= sysctl_latencytop,
 	},
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -991,6 +1010,35 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SCHED_BFS
+	{
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.procname	= "spin_retry",
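For reference, the three BFS knobs registered above appear under /proc/sys/kernel/ with the bounds given by their extra1/extra2 fields: rr_interval takes 1..1000 (the BFS round-robin timeslice, documented by BFS in milliseconds), interactive is a 0/1 toggle, and iso_cpu is a 0..100 percentage used to cap unprivileged SCHED_ISO tasks. They are inspected or set the usual way, e.g. "sysctl kernel.rr_interval" or "echo 70 > /proc/sys/kernel/iso_cpu" (the value is only an example, not a recommendation).
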
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971..bce3211 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -118,3 +118,4 @@ void task_work_run(void)
 		} while (work);
 	}
 }
+EXPORT_SYMBOL_GPL(task_work_run);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f..6931b6e 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -89,7 +89,7 @@ config NO_HZ_IDLE
 config NO_HZ_FULL
 	bool "Full dynticks system (tickless)"
 	# NO_HZ_COMMON dependency
-	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
 	# We need at least one periodic CPU for timekeeping
 	depends on SMP
 	depends on HAVE_CONTEXT_TRACKING
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index f5e86d2..7a48442 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -448,7 +448,7 @@ static void cleanup_timers(struct list_head *head)
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+	add_device_randomness((const void*) &tsk_seruntime(tsk),
 						sizeof(unsigned long long));
 	cleanup_timers(tsk->cpu_timers);
 
@@ -878,7 +878,7 @@ static void check_thread_timers(struct task_struct *tsk,
 	tsk_expires->virt_exp = expires_to_cputime(expires);
 
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
-						   tsk->se.sum_exec_runtime);
+						   tsk_seruntime(tsk));
 
 	/*
 	 * Check for the special case thread timers.
@@ -889,7 +889,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -897,7 +897,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1144,7 +1144,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		struct task_cputime task_sample;
 
 		task_cputime(tsk, &task_sample.utime, &task_sample.stime);
-		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
+		task_sample.sum_exec_runtime = tsk_seruntime(tsk);
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
 			return 1;
 	}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea..287cf72 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a -deadline thread */
 	static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_BFS
+		/* No deadline on BFS, use RR */
+		.sched_policy = SCHED_RR,
+#else
 		.sched_policy = SCHED_DEADLINE,
 		.sched_runtime = 100000ULL,
 		.sched_deadline = 10000000ULL,
 		.sched_period = 10000000ULL
+#endif
 	};
 	struct wakeup_test_data *x = data;
 
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49..f8e26ab 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-usec to Mbyte-usec */
-	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+	/* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+	do_div(stats->coremem, 1000 * KB);
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+	do_div(stats->virtmem, 1000 * KB);
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 static void __acct_update_integrals(struct task_struct *tsk,
 				    cputime_t utime, cputime_t stime)
 {
-	if (likely(tsk->mm)) {
-		cputime_t time, dtime;
-		struct timeval value;
-		unsigned long flags;
-		u64 delta;
-
-		local_irq_save(flags);
-		time = stime + utime;
-		dtime = time - tsk->acct_timexpd;
-		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-		delta = value.tv_sec;
-		delta = delta * USEC_PER_SEC + value.tv_usec;
-
-		if (delta == 0)
-			goto out;
-		tsk->acct_timexpd = time;
-		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-	out:
-		local_irq_restore(flags);
-	}
+	cputime_t time, dtime;
+	u64 delta;
+
+	if (!likely(tsk->mm))
+		return;
+
+	time = stime + utime;
+	dtime = time - tsk->acct_timexpd;
+	/* Avoid division: cputime_t is often in nanoseconds already. */
+	delta = cputime_to_nsecs(dtime);
+
+	if (delta < TICK_NSEC)
+		return;
+
+	tsk->acct_timexpd = time;
+	/*
+	 * Divide by 1024 to avoid overflow, and to avoid division.
+	 * The final unit reported to userspace is Mbyte-usecs,
+	 * the rest of the math is done in xacct_add_tsk.
+	 */
+	tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+	tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
 }
 
 /**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
 void acct_update_integrals(struct task_struct *tsk)
 {
 	cputime_t utime, stime;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	task_cputime(tsk, &utime, &stime);
 	__acct_update_integrals(tsk, utime, stime);
+	local_irq_restore(flags);
 }
 
 /**
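A quick unit check on the new accounting path: __acct_update_integrals() now accumulates delta_ns * rss_pages >> 10, so acct_rss_mem1 carries pages*ns/1024. xacct_add_tsk() then multiplies by PAGE_SIZE (giving bytes*ns/1024) and divides by 1000*KB = 1024000, i.e. bytes*ns / (1024 * 1000 * 1024) = (bytes / 2^20) * (ns / 1000), which is the Mbyte-usec figure userspace expects - the same result as the old pages-usec arithmetic, but without a division in the hot path.
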
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8bfd1ac..e0b3031 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -876,7 +876,7 @@ config SCHED_INFO
 
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
-	depends on DEBUG_KERNEL && PROC_FS
+	depends on DEBUG_KERNEL && PROC_FS && !SCHED_BFS
 	select SCHED_INFO
 	help
 	  If you say Y here, additional code will be inserted into the
@@ -1258,7 +1258,7 @@ config TORTURE_TEST
 
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !SCHED_BFS
 	select TORTURE_TEST
 	select SRCU
 	select TASKS_RCU
@@ -1615,6 +1615,7 @@ config LATENCYTOP
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 	depends on PROC_FS
+	depends on !SCHED_BFS
 	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
 	select KALLSYMS
 	select KALLSYMS_ALL
diff --git a/mm/Makefile b/mm/Makefile
index 2ed4319..e3a53f5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,7 +21,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   debug.o $(mmu-y)
+			   prfile.o debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5..5c6da0f 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,6 +40,12 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	{1UL << PG_hwpoison,		"hwpoison"	},
 #endif
+#ifdef CONFIG_TOI_INCREMENTAL
+	{1UL << PG_toi_untracked,	"toi_untracked"	},
+	{1UL << PG_toi_ro,		"toi_ro"	},
+	{1UL << PG_toi_cbw,		"toi_cbw"	},
+	{1UL << PG_toi_dirty,		"toi_dirty"	},
+#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
 	{1UL << PG_young,		"young"		},
 	{1UL << PG_idle,		"idle"		},
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d..f800f87 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2229,7 +2229,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int ret = VM_FAULT_LOCKED;
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	vma_file_update_time(vma);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping) {
 		unlock_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 3345dcf..869aa2c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2082,7 +2082,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
 		}
 
 		if (!page_mkwrite)
-			file_update_time(vma->vm_file);
+			vma_file_update_time(vma);
 	}
 
 	return VM_FAULT_WRITE;
diff --git a/mm/mmap.c b/mm/mmap.c
index 76d1ec2..fdd163e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -290,7 +290,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	mpol_put(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
@@ -909,7 +909,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 	if (remove_next) {
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
+			vma_fput(vma);
 		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
@@ -1683,8 +1683,8 @@ out:
 	return addr;
 
 unmap_and_free_vma:
+	vma_fput(vma);
 	vma->vm_file = NULL;
-	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
 	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
@@ -2479,7 +2479,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_free_mpol;
 
 	if (new->vm_file)
-		get_file(new->vm_file);
+		vma_get_file(new);
 
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
@@ -2498,7 +2498,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (new->vm_ops && new->vm_ops->close)
 		new->vm_ops->close(new);
 	if (new->vm_file)
-		fput(new->vm_file);
+		vma_fput(new);
 	unlink_anon_vmas(new);
  out_free_mpol:
 	mpol_put(vma_policy(new));
@@ -2640,7 +2640,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	struct vm_area_struct *vma;
 	unsigned long populate = 0;
 	unsigned long ret = -EINVAL;
-	struct file *file;
 
 	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
 			"See Documentation/vm/remap_file_pages.txt.\n",
@@ -2708,10 +2707,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		}
 	}
 
-	file = get_file(vma->vm_file);
+	vma_get_file(vma);
 	ret = do_mmap_pgoff(vma->vm_file, start, size,
 			prot, flags, pgoff, &populate);
-	fput(file);
+	vma_fput(vma);
 out:
 	up_write(&mm->mmap_sem);
 	if (populate)
@@ -2982,7 +2981,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
-			get_file(new_vma->vm_file);
+			vma_get_file(new_vma);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/nommu.c b/mm/nommu.c
index fbf6f0f..1a4a06d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -671,7 +671,7 @@ static void __put_nommu_region(struct vm_region *region)
 		up_write(&nommu_region_sem);
 
 		if (region->vm_file)
-			fput(region->vm_file);
+			vmr_fput(region);
 
 		/* IO memory and memory shared directly out of the pagecache
 		 * from ramfs/tmpfs mustn't be released here */
@@ -829,7 +829,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	put_nommu_region(vma->vm_region);
 	kmem_cache_free(vm_area_cachep, vma);
 }
@@ -1355,7 +1355,7 @@ unsigned long do_mmap(struct file *file,
 					goto error_just_free;
 				}
 			}
-			fput(region->vm_file);
+			vmr_fput(region);
 			kmem_cache_free(vm_region_jar, region);
 			region = pregion;
 			result = start;
@@ -1430,10 +1430,10 @@ error_just_free:
 	up_write(&nommu_region_sem);
 error:
 	if (region->vm_file)
-		fput(region->vm_file);
+		vmr_fput(region);
 	kmem_cache_free(vm_region_jar, region);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 	return ret;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 62bbf35..2a88657 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -70,7 +70,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+int dirty_background_ratio = 20;
+#else
 int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -87,7 +91,11 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+int vm_dirty_ratio = 50;
+#else
 int vm_dirty_ratio = 20;
+#endif
 
 /*
  * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
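For a sense of scale: with the CONFIG_PCK_INTERACTIVE values above and roughly 8 GiB of dirtyable memory (roughly free plus reclaimable file pages, not total RAM), background writeback kicks in around 20% = 1.6 GiB of dirty data and the hard dirty limit sits around 50% = 4 GiB, versus about 0.8 GiB and 1.6 GiB with the stock 10/20 defaults.
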
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 629ce64..7b387cc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -62,6 +62,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
+#include <linux/tuxonice.h>
 #include <linux/kthread.h>
 
 #include <asm/sections.h>
@@ -771,6 +772,12 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page->mem_cgroup))
 		bad_reason = "page still charged to cgroup";
 #endif
+	if (unlikely(PageTOI_Untracked(page))) {
+		/* Make it writable and included in image if allocated. */
+		ClearPageTOI_Untracked(page);
+		/* If it gets allocated, it will be dirty from TOI's POV. */
+		SetPageTOI_Dirty(page);
+	}
 	if (unlikely(bad_reason)) {
 		bad_page(page, bad_reason, bad_flags);
 		return 1;
@@ -1410,6 +1417,11 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 		struct page *p = page + i;
 		if (unlikely(check_new_page(p)))
 			return 1;
+		if (unlikely(toi_incremental_support() && (gfp_flags & ___GFP_TOI_NOTRACK))) {
+			/* Make the page writable if it's protected, and set it to be untracked. */
+			SetPageTOI_Untracked(p);
+			toi_make_writable(init_mm.pgd, (unsigned long)page_address(p));
+		}
 	}
 
 	set_page_private(page, 0);
diff --git a/mm/percpu.c b/mm/percpu.c
index 998607a..2f040d0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -125,6 +125,7 @@ static int pcpu_nr_units __read_mostly;
 static int pcpu_atom_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_pfns;
 
 /* cpus with the lowest and highest unit addresses */
 static unsigned int pcpu_low_unit_cpu __read_mostly;
@@ -1790,6 +1791,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	/* calculate size_sum and ensure dyn_size is enough for early alloc */
 	size_sum = PFN_ALIGN(static_size + reserved_size +
 			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+	pcpu_pfns = PFN_DOWN(size_sum);
 	dyn_size = size_sum - static_size - reserved_size;
 
 	/*
@@ -2277,6 +2279,22 @@ void __init percpu_init_late(void)
 	}
 }
 
+#ifdef CONFIG_TOI_INCREMENTAL
+/*
+ * It doesn't matter if we mark an extra page as untracked (and therefore
+ * always save it in incremental images).
+ */
+void toi_mark_per_cpus_pages_untracked(void)
+{
+	int i;
+
+	struct page *page = virt_to_page(pcpu_base_addr);
+
+	for (i = 0; i < pcpu_pfns; i++)
+		SetPageTOI_Untracked(page + i);
+}
+#endif
+
 /*
  * Percpu allocator is initialized early during boot when neither slab or
  * workqueue is available.  Plug async management until everything is up
diff --git b/mm/prfile.c b/mm/prfile.c
new file mode 100644
index 0000000..b323b8a
--- /dev/null
+++ b/mm/prfile.c
@@ -0,0 +1,86 @@
+/*
+ * Mainly for aufs, which mmap(2)s a different file and wants to print a
+ * different path in /proc/PID/maps.
+ * Call these functions via macros defined in linux/mm.h.
+ *
+ * See Documentation/filesystems/aufs/design/06mmap.txt
+ *
+ * Copyright (c) 2014 Junjiro R. Okajima
+ * Copyright (c) 2014 Ian Campbell
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+
+/* #define PRFILE_TRACE */
+static inline void prfile_trace(struct file *f, struct file *pr,
+			      const char func[], int line, const char func2[])
+{
+#ifdef PRFILE_TRACE
+	if (pr)
+		pr_info("%s:%d: %s, %s\n", func, line, func2,
+			f ? (char *)f->f_path.dentry->d_name.name : "(null)");
+#endif
+}
+
+void vma_do_file_update_time(struct vm_area_struct *vma, const char func[],
+			     int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	file_update_time(f);
+	if (f && pr)
+		file_update_time(pr);
+}
+
+struct file *vma_do_pr_or_file(struct vm_area_struct *vma, const char func[],
+			       int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	return (f && pr) ? pr : f;
+}
+
+void vma_do_get_file(struct vm_area_struct *vma, const char func[], int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	get_file(f);
+	if (f && pr)
+		get_file(pr);
+}
+
+void vma_do_fput(struct vm_area_struct *vma, const char func[], int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	fput(f);
+	if (f && pr)
+		fput(pr);
+}
+
+#ifndef CONFIG_MMU
+struct file *vmr_do_pr_or_file(struct vm_region *region, const char func[],
+			       int line)
+{
+	struct file *f = region->vm_file, *pr = region->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	return (f && pr) ? pr : f;
+}
+
+void vmr_do_fput(struct vm_region *region, const char func[], int line)
+{
+	struct file *f = region->vm_file, *pr = region->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	fput(f);
+	if (f && pr)
+		fput(pr);
+}
+#endif /* !CONFIG_MMU */
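A hedged sketch of the intended call pattern from a /proc/<pid>/maps style printer; vma_pr_or_file() is assumed to be the wrapper macro that linux/mm.h (outside this hunk) defines over vma_do_pr_or_file():

	struct file *file = vma->vm_file;

	if (file) {
		/* Prefer the aufs-provided path when a proxy file is attached. */
		file = vma_pr_or_file(vma);
		/* ...then print file->f_path exactly as before. */
	}
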
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7..f64ab5f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1500,7 +1500,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
-				     umode_t mode, dev_t dev, unsigned long flags)
+				     umode_t mode, dev_t dev, unsigned long flags, int atomic_copy)
 {
 	struct inode *inode;
 	struct shmem_inode_info *info;
@@ -1521,6 +1521,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		spin_lock_init(&info->lock);
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
+		if (atomic_copy)
+			inode->i_flags |= S_ATOMIC_COPY;
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
 		cache_no_acl(inode);
@@ -2310,7 +2312,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	struct inode *inode;
 	int error = -ENOSPC;
 
-	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE, 0);
 	if (inode) {
 		error = simple_acl_create(dir, inode);
 		if (error)
@@ -2339,7 +2341,7 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct inode *inode;
 	int error = -ENOSPC;
 
-	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE, 0);
 	if (inode) {
 		error = security_inode_init_security(inode, dir,
 						     NULL,
@@ -2531,7 +2533,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	if (len > PAGE_CACHE_SIZE)
 		return -ENAMETOOLONG;
 
-	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE, 0);
 	if (!inode)
 		return -ENOSPC;
 
@@ -3010,7 +3012,7 @@ SYSCALL_DEFINE2(memfd_create,
 		goto err_name;
 	}
 
-	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	file = shmem_file_setup(name, 0, VM_NORESERVE, 0);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
@@ -3101,7 +3103,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= MS_POSIXACL;
 #endif
 
-	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE, 0);
 	if (!inode)
 		goto failed;
 	inode->i_uid = sbinfo->uid;
@@ -3357,7 +3359,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
 #define shmem_vm_ops				generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
-#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_get_inode(sb, dir, mode, dev, flags, atomic_copy)	ramfs_get_inode(sb, dir, mode, dev)
 #define shmem_acct_size(flags, size)		0
 #define shmem_unacct_size(flags, size)		do {} while (0)
 
@@ -3370,7 +3372,8 @@ static struct dentry_operations anon_ops = {
 };
 
 static struct file *__shmem_file_setup(const char *name, loff_t size,
-				       unsigned long flags, unsigned int i_flags)
+				       unsigned long flags, unsigned int i_flags,
+				       int atomic_copy)
 {
 	struct file *res;
 	struct inode *inode;
@@ -3399,7 +3402,7 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	d_set_d_op(path.dentry, &anon_ops);
 
 	res = ERR_PTR(-ENOSPC);
-	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
+	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags, atomic_copy);
 	if (!inode)
 		goto put_memory;
 
@@ -3435,9 +3438,9 @@ put_path:
  * @size: size to be set for the file
  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
  */
-struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags, int atomic_copy)
 {
-	return __shmem_file_setup(name, size, flags, S_PRIVATE);
+	return __shmem_file_setup(name, size, flags, S_PRIVATE, atomic_copy);
 }
 
 /**
@@ -3446,9 +3449,9 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon
  * @size: size to be set for the file
  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
  */
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags, int atomic_copy)
 {
-	return __shmem_file_setup(name, size, flags, 0);
+	return __shmem_file_setup(name, size, flags, 0, atomic_copy);
 }
 EXPORT_SYMBOL_GPL(shmem_file_setup);
 
@@ -3467,7 +3470,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	 * accessible to the user through its mapping, use S_PRIVATE flag to
 	 * bypass file security, in the same way as shmem_kernel_file_setup().
 	 */
-	file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
+	file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
diff --git a/mm/slub.c b/mm/slub.c
index 2a722e1..741e507 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1379,7 +1379,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	struct page *page;
 	int order = oo_order(oo);
 
-	flags |= __GFP_NOTRACK;
+	flags |= (__GFP_NOTRACK | ___GFP_TOI_NOTRACK);
 
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
@@ -3543,7 +3543,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK;
+	flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_TOI_NOTRACK;
 	page = alloc_kmem_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d2c3736..98d3483 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -9,6 +9,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
+#include <linux/export.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/vmalloc.h>
@@ -43,7 +44,6 @@
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
-static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 
 DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -719,6 +719,60 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev, bool frontswap);
+
+void get_swap_range_of_type(int type, swp_entry_t *start, swp_entry_t *end,
+		unsigned int limit)
+{
+	struct swap_info_struct *si;
+	pgoff_t start_at;
+	unsigned int i;
+
+	*start = swp_entry(0, 0);
+	*end = swp_entry(0, 0);
+	si = swap_info[type];
+	spin_lock(&si->lock);
+	if (si && (si->flags & SWP_WRITEOK)) {
+		atomic_long_dec(&nr_swap_pages);
+		/* This is called for allocating swap entry, not cache */
+		start_at = scan_swap_map(si, 1);
+		if (start_at) {
+			unsigned long stop_at = find_next_to_unuse(si, start_at, 0);
+			if (stop_at > start_at)
+				stop_at--;
+			else
+				stop_at = si->max - 1;
+			if (stop_at - start_at + 1 > limit)
+				stop_at = min_t(unsigned int,
+						start_at + limit - 1,
+						si->max - 1);
+			/* Mark them used */
+			for (i = start_at; i <= stop_at; i++)
+				si->swap_map[i] = 1;
+			/* first page already done above */
+			si->inuse_pages += stop_at - start_at;
+
+			atomic_long_sub(stop_at - start_at, &nr_swap_pages);
+			if (start_at == si->lowest_bit)
+				si->lowest_bit = stop_at + 1;
+			if (stop_at == si->highest_bit)
+				si->highest_bit = start_at - 1;
+			if (si->inuse_pages == si->pages) {
+				si->lowest_bit = si->max;
+				si->highest_bit = 0;
+			}
+			for (i = start_at + 1; i <= stop_at; i++)
+				inc_cluster_info_page(si, si->cluster_info, i);
+			si->cluster_next = stop_at + 1;
+			*start = swp_entry(type, start_at);
+			*end = swp_entry(type, stop_at);
+		} else
+			atomic_long_inc(&nr_swap_pages);
+	}
+	spin_unlock(&si->lock);
+}
+
 static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
@@ -1607,7 +1661,7 @@ static void drain_mmlist(void)
  * Note that the type of this function is sector_t, but it returns page offset
  * into the bdev, not sector offset.
  */
-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
+sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
 	struct swap_info_struct *sis;
 	struct swap_extent *start_se;
@@ -2738,8 +2792,14 @@ pgoff_t __page_file_index(struct page *page)
 	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 	return swp_offset(swap);
 }
+
 EXPORT_SYMBOL_GPL(__page_file_index);
 
+struct swap_info_struct *get_swap_info_struct(unsigned type)
+{
+	return swap_info[type];
+}
+
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
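A minimal sketch of how a hibernation writer might consume the new range allocator (type and limit are caller-supplied; the swp_offset() test works because offset 0 is never handed out by scan_swap_map()):

	swp_entry_t first, last;

	get_swap_range_of_type(type, &first, &last, limit);
	if (swp_offset(first)) {
		/* Entries first..last on this device are now marked in use. */
	} else {
		/* No space: both entries come back as swp_entry(0, 0). */
	}
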
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c712b01..0da5727 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1475,7 +1475,7 @@ static int too_many_isolated(struct zone *zone, int file,
 {
 	unsigned long inactive, isolated;
 
-	if (current_is_kswapd())
+	if (current_is_kswapd() || sc->hibernation_mode)
 		return 0;
 
 	if (!sane_reclaim(sc))
@@ -2345,6 +2345,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	unsigned long pages_for_compaction;
 	unsigned long inactive_lru_pages;
 
+	if (nr_reclaimed && nr_scanned && sc->nr_to_reclaim >= sc->nr_reclaimed)
+		return true;
+
 	/* If not in reclaim/compaction mode, stop */
 	if (!in_reclaim_compaction(sc))
 		return false;
@@ -2659,6 +2662,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
 	bool zones_reclaimable;
+
+#ifdef CONFIG_FREEZER
+	if (unlikely(pm_freezing && !sc->hibernation_mode))
+		return 0;
+#endif
+
 retry:
 	delayacct_freepages_start();
 
@@ -3540,6 +3549,11 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (!populated_zone(zone))
 		return;
 
+#ifdef CONFIG_FREEZER
+	if (pm_freezing)
+		return;
+#endif
+
 	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
 		return;
 	pgdat = zone->zone_pgdat;
@@ -3565,7 +3579,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  * LRU order by reclaiming preferentially
  * inactive > active > active referenced > active mapped
  */
-unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
+unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask)
 {
 	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
@@ -3594,6 +3608,11 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 
 	return nr_reclaimed;
 }
+
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
+{
+	return shrink_memory_mask(nr_to_reclaim, GFP_HIGHUSER_MOVABLE);
+}
 #endif /* CONFIG_HIBERNATION */
 
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce46..2d49cee 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -8,7 +8,11 @@
 #include <linux/ktime.h>
 #include <linux/string.h>
 #include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 
+#include <net/tcp.h>
 #include <net/secure_seq.h>
 
 #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
@@ -39,6 +43,102 @@ static u32 seq_scale(u32 seq)
 }
 #endif
 
+#ifdef CONFIG_TCP_STEALTH
+u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr,
+				u32 daddr_size, __be16 dport)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *md5;
+
+	__u32 sec[MD5_MESSAGE_BYTES / sizeof(__u32)];
+	__u32 i;
+	__u32 tsval = 0;
+
+	__be32 iv[MD5_DIGEST_WORDS] = { 0 };
+	__be32 isn;
+
+	memcpy(iv, daddr, (daddr_size > sizeof(iv)) ? sizeof(iv) : daddr_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+	md5 = tp->af_specific->md5_lookup(sk, sk);
+#else
+	md5 = NULL;
+#endif
+	if (likely(sysctl_tcp_timestamps && !md5) || tp->stealth.saw_tsval)
+		tsval = tp->stealth.mstamp.stamp_jiffies;
+
+	((__be16 *)iv)[2] ^= cpu_to_be16(tp->stealth.integrity_hash);
+	iv[2] ^= cpu_to_be32(tsval);
+	((__be16 *)iv)[6] ^= dport;
+
+	for (i = 0; i < MD5_DIGEST_WORDS; i++)
+		iv[i] = le32_to_cpu(iv[i]);
+	for (i = 0; i < MD5_MESSAGE_BYTES / sizeof(__le32); i++)
+		sec[i] = le32_to_cpu(((__le32 *)tp->stealth.secret)[i]);
+
+	md5_transform(iv, sec);
+
+	isn = cpu_to_be32(iv[0]) ^ cpu_to_be32(iv[1]) ^
+	      cpu_to_be32(iv[2]) ^ cpu_to_be32(iv[3]);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY)
+		be32_isn_to_be16_ih(isn) =
+			cpu_to_be16(tp->stealth.integrity_hash);
+
+	return be32_to_cpu(isn);
+}
+EXPORT_SYMBOL(tcp_stealth_sequence_number);
+
+u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
+	__be32 isn = th->seq;
+	__be32 hash;
+	__be32 *daddr;
+	u32 daddr_size;
+
+	tp->stealth.saw_tsval =
+		tcp_parse_tsval_option(&tp->stealth.mstamp.stamp_jiffies, th);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN)
+		tp->stealth.integrity_hash =
+			be16_to_cpu(be32_isn_to_be16_ih(isn));
+
+	switch (tp->inet_conn.icsk_inet.sk.sk_family) {
+#if IS_ENABLED(CONFIG_IPV6)
+	case PF_INET6:
+		daddr_size = sizeof(ipv6_hdr(skb)->daddr.s6_addr32);
+		daddr = ipv6_hdr(skb)->daddr.s6_addr32;
+	break;
+#endif
+	case PF_INET:
+		daddr_size = sizeof(ip_hdr(skb)->daddr);
+		daddr = &ip_hdr(skb)->daddr;
+	break;
+	default:
+		pr_err("TCP Stealth: Unknown network layer protocol, stop!\n");
+		return 1;
+	}
+
+	hash = tcp_stealth_sequence_number(sk, daddr, daddr_size, th->dest);
+	cpu_to_be32s(&hash);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN &&
+	    be32_isn_to_be16_av(isn) == be32_isn_to_be16_av(hash))
+		return 0;
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    !(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+	    isn == hash)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(tcp_stealth_do_auth);
+#endif
+
 #if IS_ENABLED(CONFIG_IPV6)
 __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				   __be16 sport, __be16 dport)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7758247..7ccd693 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -653,6 +653,9 @@ choice
 	config DEFAULT_VEGAS
 		bool "Vegas" if TCP_CONG_VEGAS=y
 
+	config DEFAULT_YEAH
+		bool "YeAH" if TCP_CONG_YEAH=y
+
 	config DEFAULT_VENO
 		bool "Veno" if TCP_CONG_VENO=y
 
@@ -683,6 +686,7 @@ config DEFAULT_TCP_CONG
 	default "htcp" if DEFAULT_HTCP
 	default "hybla" if DEFAULT_HYBLA
 	default "vegas" if DEFAULT_VEGAS
+	default "yeah" if DEFAULT_YEAH
 	default "westwood" if DEFAULT_WESTWOOD
 	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
@@ -700,3 +704,13 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
+
+config TCP_STEALTH
+	bool "TCP: Stealth TCP socket support"
+	default n
+	---help---
+	  This option enables support for stealth TCP sockets. If you do not
+	  know what this means, you do not need it.
+
+	  If unsure, say N.
+
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 483ffdf..0da50fe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,6 +269,7 @@
 #include <linux/crypto.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -2318,6 +2319,43 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
 	return 0;
 }
 
+#ifdef CONFIG_TCP_STEALTH
+int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len)
+{
+	struct scatterlist sg[2];
+	struct crypto_hash *tfm;
+	struct hash_desc desc;
+	__be16 h[MD5_DIGEST_WORDS * 2];
+	int i;
+	int err = 0;
+
+	tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		err = PTR_ERR(tfm);
+		goto out;
+	}
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(sg, 2);
+	sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES);
+	sg_set_buf(&sg[1], payload, len);
+
+	if (crypto_hash_digest(&desc, sg, MD5_MESSAGE_BYTES + len, (u8 *)h)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	*hash = be16_to_cpu(h[0]);
+	for (i = 1; i < MD5_DIGEST_WORDS * 2; i++)
+		*hash ^= be16_to_cpu(h[i]);
+
+out:
+	crypto_free_hash(tfm);
+	return err;
+}
+#endif
+
 /*
  *	Socket option code for TCP.
  */
@@ -2348,6 +2386,66 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+#ifdef CONFIG_TCP_STEALTH
+	case TCP_STEALTH: {
+		u8 secret[MD5_MESSAGE_BYTES] = { 0 };
+
+		val = copy_from_user(secret, optval,
+				     min_t(unsigned int, optlen,
+					   MD5_MESSAGE_BYTES));
+		if (val != 0)
+			return -EFAULT;
+
+		lock_sock(sk);
+		memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES);
+		tp->stealth.mode = TCP_STEALTH_MODE_AUTH;
+		tp->stealth.mstamp.v64 = 0;
+		tp->stealth.saw_tsval = false;
+		release_sock(sk);
+		return err;
+	}
+	case TCP_STEALTH_INTEGRITY: {
+		u8 *payload;
+
+		lock_sock(sk);
+
+		if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+			err = -EOPNOTSUPP;
+			goto stealth_integrity_out_1;
+		}
+
+		if (optlen < 1 || optlen > USHRT_MAX) {
+			err = -EINVAL;
+			goto stealth_integrity_out_1;
+		}
+
+		payload = vmalloc(optlen);
+		if (!payload) {
+			err = -ENOMEM;
+			goto stealth_integrity_out_1;
+		}
+
+		val = copy_from_user(payload, optval, optlen);
+		if (val != 0) {
+			err = -EFAULT;
+			goto stealth_integrity_out_2;
+		}
+
+		err = tcp_stealth_integrity(&tp->stealth.integrity_hash,
+					    tp->stealth.secret, payload,
+					    optlen);
+		if (err)
+			goto stealth_integrity_out_2;
+
+		tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY;
+
+stealth_integrity_out_2:
+		vfree(payload);
+stealth_integrity_out_1:
+		release_sock(sk);
+		return err;
+	}
+#endif
 	default:
 		/* fallthru */
 		break;
@@ -2599,6 +2697,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
 		break;
+#ifdef CONFIG_TCP_STEALTH
+	case TCP_STEALTH_INTEGRITY_LEN:
+		if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+			err = -EOPNOTSUPP;
+		} else if (val < 1 || val > USHRT_MAX) {
+			err = -EINVAL;
+		} else {
+			tp->stealth.integrity_len = val;
+			tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN;
+		}
+		break;
+#endif
 	default:
 		err = -ENOPROTOOPT;
 		break;
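A minimal userspace sketch of arming TCP Stealth on a socket before connect() or listen(); TCP_STEALTH is the option value this patch adds to the uapi <linux/tcp.h> (outside this hunk), and the secret buffer is MD5_MESSAGE_BYTES (64) bytes, with shorter user input zero-padded by the copy in the handler above:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>	/* patched header providing TCP_STEALTH */

	static int stealth_socket(const char *shared_secret)
	{
		char secret[64] = { 0 };	/* MD5_MESSAGE_BYTES */
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;
		strncpy(secret, shared_secret, sizeof(secret));
		if (setsockopt(fd, IPPROTO_TCP, TCP_STEALTH,
			       secret, sizeof(secret)) < 0) {
			/* A kernel built without CONFIG_TCP_STEALTH returns ENOPROTOOPT. */
			close(fd);
			return -1;
		}
		return fd;	/* now connect() on the client, bind()/listen() on the server */
	}
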
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3b2c8e9..b3a47bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -77,6 +77,9 @@
 #include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
+#ifdef CONFIG_TCP_STEALTH
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
+#endif
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
@@ -3850,6 +3853,47 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 	return true;
 }
 
+#ifdef CONFIG_TCP_STEALTH
+/* Parse only the TSVal field of the TCP Timestamp option header.
+ */
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th)
+{
+	int length = (th->doff << 2) - sizeof(*th);
+	const u8 *ptr = (const u8 *)(th + 1);
+
+	/* If the TCP option is too short, we can short cut */
+	if (length < TCPOLEN_TIMESTAMP)
+		return false;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return false;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		case TCPOPT_TIMESTAMP:
+			opsize = *ptr++;
+			if (opsize != TCPOLEN_TIMESTAMP || opsize > length)
+				return false;
+			*tsval = get_unaligned_be32(ptr);
+			return true;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2 || opsize > length)
+				return false;
+		}
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_parse_tsval_option);
+#endif
+
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * Parse MD5 Signature option
@@ -4532,6 +4576,31 @@ err:
 
 }
 
+#ifdef CONFIG_TCP_STEALTH
+static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 hash;
+	__be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1);
+	char *data = skb->data + th->doff * 4;
+	int len = skb->len - th->doff * 4;
+
+	if (len < tp->stealth.integrity_len)
+		return 1;
+
+	if (tcp_stealth_integrity(&hash, tp->stealth.secret, data,
+				  tp->stealth.integrity_len))
+		return 1;
+
+	if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash))
+		return 1;
+
+	tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN;
+	return 0;
+}
+#endif
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4541,6 +4610,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
 		goto drop;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+	    __tcp_stealth_integrity_check(sk, skb)) {
+		tcp_reset(sk);
+		goto drop;
+	}
+#endif
+
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
@@ -5313,6 +5390,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			int eaten = 0;
 			bool fragstolen = false;
 
+#ifdef CONFIG_TCP_STEALTH
+			if (unlikely(tp->stealth.mode &
+				     TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+			    __tcp_stealth_integrity_check(sk, skb)) {
+				tcp_reset(sk);
+				goto discard;
+			}
+#endif
+
 			if (tp->ucopy.task == current &&
 			    tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a7b1a90..b8f3908 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -74,6 +74,7 @@
 #include <net/xfrm.h>
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
+#include <net/secure_seq.h>
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -234,6 +235,21 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
 
+#ifdef CONFIG_TCP_STEALTH
+	/* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+	 * early as possible and thus move taking the snapshot of tcp_time_stamp
+	 * here.
+	 */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+					&inet->inet_daddr,
+					sizeof(inet->inet_daddr),
+					usin->sin_port);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 							   inet->inet_daddr,
@@ -1376,6 +1392,8 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
  */
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sock *rsk;
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
@@ -1397,6 +1415,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) &&
+	    tcp_stealth_do_auth(sk, skb)) {
+		rsk = sk;
+		goto reset;
+	}
+#endif
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fda379c..20f2812 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -931,6 +931,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+#ifdef CONFIG_TCP_STEALTH
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN &&
+		     tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+		skb->skb_mstamp = tp->stealth.mstamp;
+	}
+#endif
+
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
@@ -3258,7 +3265,15 @@ int tcp_connect(struct sock *sk)
 		return -ENOBUFS;
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+#ifdef CONFIG_TCP_STEALTH
+	/* The timestamp was already taken at the time the ISN was generated,
+	 * as we need to know its value in the tcp_stealth_sequence_number()
+	 * function.
+	 */
+	tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies;
+#else
 	tp->retrans_stamp = tcp_time_stamp;
+#endif
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5c8c842..62e167e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -62,6 +62,7 @@
 #include <net/inet_common.h>
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
+#include <net/secure_seq.h>
 
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -278,6 +279,21 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sk_set_txhash(sk);
 
+#ifdef CONFIG_TCP_STEALTH
+	/* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+	 * early as possible and thus move taking the snapshot of tcp_time_stamp
+	 * here.
+	 */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+					sk->sk_v6_daddr.s6_addr32,
+					sizeof(sk->sk_v6_daddr),
+					inet->inet_dport);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
@@ -1183,7 +1199,8 @@ out:
 static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct tcp_sock *tp;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sk_buff *opt_skb = NULL;
 
 	/* Imagine: socket is IPv6. IPv4 packet arrives,
@@ -1243,6 +1260,13 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+	    tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    tcp_stealth_do_auth(sk, skb))
+		goto reset;
+#endif
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v6_cookie_check(sk, skb);
 
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index 6fdc97e..20739e4 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -54,8 +54,8 @@ else
 fi
 
 UTS_VERSION="#$VERSION"
-CONFIG_FLAGS=""
-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
+CONFIG_FLAGS="PCK"
+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi
 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
 
diff --git b/scripts/tuxonice_output_to_csv.sh b/scripts/tuxonice_output_to_csv.sh
new file mode 100644
index 0000000..b96e680
--- /dev/null
+++ b/scripts/tuxonice_output_to_csv.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+cat $1 | grep "\*TOI\*" | cut -b 22- | sed "s/ /,/g" | sed "s/\.//" | sort -n > $1.tmp
+COLUMNS=$(cat $1.tmp | awk -F ',' ' { print $2 } ' | sort | uniq)
+echo -n "pfn," > $1.tmp2
+for NAME in $COLUMNS; do
+    echo -n "$NAME," >> $1.tmp2
+done
+echo >> $1.tmp2
+FIRST=1
+declare -A data
+while IFS=, read -r pfn column value; do
+    if [ $FIRST -eq 1 ]; then
+        FIRST=0
+        LAST_PFN=$pfn
+    fi
+    if [ $pfn -ne $LAST_PFN ]; then
+        echo -n "$LAST_PFN," >> $1.tmp2;
+        for NAME in $COLUMNS; do
+            echo -n "${data[$NAME]}," >> $1.tmp2
+        done
+        data=( )
+        echo >> $1.tmp2
+        LAST_PFN=$pfn
+    fi
+    if [ -z "$value" ]; then
+        data[$column]=X
+    else
+        data[$column]=$value
+    fi
+done < $1.tmp
+# Flush the final pfn's row, which the loop above never writes.
+if [ $FIRST -eq 0 ]; then
+    echo -n "$LAST_PFN," >> $1.tmp2
+    for NAME in $COLUMNS; do
+        echo -n "${data[$NAME]}," >> $1.tmp2
+    done
+    echo >> $1.tmp2
+fi
+mv $1.tmp2 $1.csv
+rm $1.tmp
+LIBREOFFICE=$(which libreoffice)
+[ -n "$LIBREOFFICE" ] && libreoffice $1.csv &
diff --git a/security/commoncap.c b/security/commoncap.c
index 48071ed..50a1a40 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1058,12 +1058,14 @@ int cap_mmap_addr(unsigned long addr)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(cap_mmap_addr);
 
 int cap_mmap_file(struct file *file, unsigned long reqprot,
 		  unsigned long prot, unsigned long flags)
 {
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cap_mmap_file);
 
 #ifdef CONFIG_SECURITY
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 03c1652..f88c84b 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/cgroup.h>
 #include <linux/ctype.h>
+#include <linux/export.h>
 #include <linux/list.h>
 #include <linux/uaccess.h>
 #include <linux/seq_file.h>
@@ -849,6 +850,7 @@ int __devcgroup_inode_permission(struct inode *inode, int mask)
 	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
 			access);
 }
+EXPORT_SYMBOL_GPL(__devcgroup_inode_permission);
 
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
diff --git a/security/keys/big_key.c b/security/keys/big_key.c
index 907c152..83dce8a 100644
--- a/security/keys/big_key.c
+++ b/security/keys/big_key.c
@@ -78,7 +78,7 @@ int big_key_preparse(struct key_preparsed_payload *prep)
 		 *
 		 * TODO: Encrypt the stored data with a temporary key.
 		 */
-		file = shmem_kernel_file_setup("", datalen, 0);
+		file = shmem_kernel_file_setup("", datalen, 0, 0);
 		if (IS_ERR(file)) {
 			ret = PTR_ERR(file);
 			goto error;
diff --git a/security/security.c b/security/security.c
index e8ffd92..51fa026 100644
--- a/security/security.c
+++ b/security/security.c
@@ -433,6 +433,7 @@ int security_path_rmdir(struct path *dir, struct dentry *dentry)
 		return 0;
 	return call_int_hook(path_rmdir, 0, dir, dentry);
 }
+EXPORT_SYMBOL_GPL(security_path_rmdir);
 
 int security_path_unlink(struct path *dir, struct dentry *dentry)
 {
@@ -449,6 +450,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry,
 		return 0;
 	return call_int_hook(path_symlink, 0, dir, dentry, old_name);
 }
+EXPORT_SYMBOL_GPL(security_path_symlink);
 
 int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 		       struct dentry *new_dentry)
@@ -457,6 +459,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 		return 0;
 	return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry);
 }
+EXPORT_SYMBOL_GPL(security_path_link);
 
 int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 			 struct path *new_dir, struct dentry *new_dentry,
@@ -484,6 +487,7 @@ int security_path_truncate(struct path *path)
 		return 0;
 	return call_int_hook(path_truncate, 0, path);
 }
+EXPORT_SYMBOL_GPL(security_path_truncate);
 
 int security_path_chmod(struct path *path, umode_t mode)
 {
@@ -491,6 +495,7 @@ int security_path_chmod(struct path *path, umode_t mode)
 		return 0;
 	return call_int_hook(path_chmod, 0, path, mode);
 }
+EXPORT_SYMBOL_GPL(security_path_chmod);
 
 int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
 {
@@ -498,6 +503,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
 		return 0;
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
+EXPORT_SYMBOL_GPL(security_path_chown);
 
 int security_path_chroot(struct path *path)
 {
@@ -583,6 +589,7 @@ int security_inode_readlink(struct dentry *dentry)
 		return 0;
 	return call_int_hook(inode_readlink, 0, dentry);
 }
+EXPORT_SYMBOL_GPL(security_inode_readlink);
 
 int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 			       bool rcu)
@@ -598,6 +605,7 @@ int security_inode_permission(struct inode *inode, int mask)
 		return 0;
 	return call_int_hook(inode_permission, 0, inode, mask);
 }
+EXPORT_SYMBOL_GPL(security_inode_permission);
 
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -736,6 +744,7 @@ int security_file_permission(struct file *file, int mask)
 
 	return fsnotify_perm(file, mask);
 }
+EXPORT_SYMBOL_GPL(security_file_permission);
 
 int security_file_alloc(struct file *file)
 {
@@ -795,6 +804,7 @@ int security_mmap_file(struct file *file, unsigned long prot,
 		return ret;
 	return ima_file_mmap(file, prot);
 }
+EXPORT_SYMBOL_GPL(security_mmap_file);
 
 int security_mmap_addr(unsigned long addr)
 {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index db2dd33..65da997 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
 	 * This memory barrier pairs with prepare_to_wait's set_current_state()
 	 */
 	smp_mb();
-	if (waitqueue_active(&vcpu->wq))
-		wake_up_interruptible(&vcpu->wq);
+	if (swait_active(&vcpu->wq))
+		swake_up(&vcpu->wq);
 
 	mmput(mm);
 	kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2984737..a84014e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
-	vcpu->halt_poll_ns = 0;
-	init_waitqueue_head(&vcpu->wq);
+	init_swait_queue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);
 
 	vcpu->pre_pcpu = -1;
@@ -1994,7 +1993,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	ktime_t start, cur;
-	DEFINE_WAIT(wait);
+	DECLARE_SWAITQUEUE(wait);
 	bool waited = false;
 	u64 block_ns;
 
@@ -2019,7 +2018,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	kvm_arch_vcpu_blocking(vcpu);
 
 	for (;;) {
-		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_vcpu_check_block(vcpu) < 0)
 			break;
@@ -2028,7 +2027,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		schedule();
 	}
 
-	finish_wait(&vcpu->wq, &wait);
+	finish_swait(&vcpu->wq, &wait);
 	cur = ktime_get();
 
 	kvm_arch_vcpu_unblocking(vcpu);
@@ -2060,11 +2059,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int me;
 	int cpu = vcpu->cpu;
-	wait_queue_head_t *wqp;
+	struct swait_queue_head *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (waitqueue_active(wqp)) {
-		wake_up_interruptible(wqp);
+	if (swait_active(wqp)) {
+		swake_up(wqp);
 		++vcpu->stat.halt_wakeup;
 	}
 
@@ -2165,7 +2164,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 				continue;
 			if (vcpu == me)
 				continue;
-			if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+			if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
 				continue;
 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
 				continue;