summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/power/userland-swsusp.txt149
-rw-r--r--init/do_mounts_initrd.c1
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/power.h14
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/user.c301
-rw-r--r--mm/swapfile.c6
7 files changed, 475 insertions, 7 deletions
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
new file mode 100644
index 00000000000..94058220aaf
--- /dev/null
+++ b/Documentation/power/userland-swsusp.txt
@@ -0,0 +1,149 @@
+Documentation for userland software suspend interface
+ (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel. Such utilities are available, for example, from
+<http://www.sisk.pl/kernel/utilities/suspend>. You may want to have
+a look at them if you are going to develop your own suspend/resume
+utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in kernel/power/power.h. The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing. If open for
+reading, it is considered to be in the suspend mode. Otherwise it is
+assumed to be in the resume mode. The device cannot be open for reading
+and writing. It is also impossible to have the device open more than once
+at a time.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE - freeze user space processes (the current process is
+ not frozen); this is required for SNAPSHOT_ATOMIC_SNAPSHOT
+ and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_ATOMIC_SNAPSHOT - create a snapshot of the system memory; the
+ last argument of ioctl() should be a pointer to an int variable,
+ the value of which will indicate whether the call returned after
+ creating the snapshot (1) or after restoring the system memory state
+ from it (0) (after resume the system finds itself finishing the
+ SNAPSHOT_ATOMIC_SNAPSHOT ioctl() again); after the snapshot
+ has been created the read() operation can be used to transfer
+ it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
+ uploaded snapshot image; before calling it you should transfer
+ the system memory snapshot back to the kernel using the write()
+ operation; this call will not succeed if the snapshot
+ image is not available to the kernel
+
+SNAPSHOT_FREE - free memory allocated for the snapshot image
+
+SNAPSHOT_SET_IMAGE_SIZE - set the preferred maximum size of the image
+ (the kernel will do its best to ensure the image size will not exceed
+ this number, but if it turns out to be impossible, the kernel will
+ create the smallest image possible)
+
+SNAPSHOT_AVAIL_SWAP - return the amount of available swap in bytes (the last
+ argument should be a pointer to an unsigned int variable that will
+ contain the result if the call is successful).
+
+SNAPSHOT_GET_SWAP_PAGE - allocate a swap page from the resume partition
+ (the last argument should be a pointer to a loff_t variable that
+ will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with
+ SNAPSHOT_GET_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument
+ should specify the device's major and minor numbers in the old
+ two-byte format, as returned by the stat() function in the .st_rdev
+ member of the stat structure); it is recommended to always use this
+ call, because the code to set the resume partition could be removed from
+ future kernels
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel. It has the following limitations:
+- you cannot read() more than one virtual memory page at a time
+- read()s accross page boundaries are impossible (ie. if ypu read() 1/2 of
+ a page in the previous call, you will only be able to read()
+ _at_ _most_ 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel. It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_GET_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap parition, called the resume
+partition, as storage space. However, this is not really required, as they
+can use, for example, a special (blank) suspend partition or a file on a partition
+that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and mounted afterwards.
+
+These utilities SHOULD NOT make any assumptions regarding the ordering of
+data within the snapshot image, except for the image header that MAY be
+assumed to start with an swsusp_info structure, as specified in
+kernel/power/power.h. This structure MAY be used by the userland utilities
+to obtain some information about the snapshot image, such as the size
+of the snapshot image, including the metadata and the header itself,
+contained in the .size member of swsusp_info.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read). Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header. If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferrably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_ATOMIC_SNAPSHOT
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+1. If the value is 1 (ie. the system memory snapshot has just been
+ created and the system is ready for saving it):
+ (a) The suspending utility MUST NOT close the snapshot device
+ _unless_ the whole suspend procedure is to be cancelled, in
+ which case, if the snapshot image has already been saved, the
+ suspending utility SHOULD destroy it, preferrably by zapping
+ its header. If the suspend is not to be cancelled, the
+ system MUST be powered off or rebooted after the snapshot
+ image has been saved.
+ (b) The suspending utility SHOULD NOT attempt to perform any
+ file system operations (including reads) on the file systems
+ that were mounted before SNAPSHOT_ATOMIC_SNAPSHOT has been
+ called. However, it MAY mount a file system that was not
+ mounted at that time and perform some operations on it (eg.
+ use it for saving the image).
+2. If the value is 0 (ie. the system state has just been restored from
+ the snapshot image), the suspending utility MUST close the snapshot
+ device. Afterwards it will be treated as a regular userland process,
+ so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index a05cabd0fd1..405f9031af8 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -56,6 +56,7 @@ static void __init handle_initrd(void)
sys_chroot(".");
mount_devfs_fs ();
+ current->flags |= PF_NOFREEZE;
pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
if (pid > 0) {
while (pid != sys_wait4(-1, NULL, 0, NULL))
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index bb91a061530..8d0af3d37a4 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
obj-y := main.o process.o console.o
obj-$(CONFIG_PM_LEGACY) += pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
obj-$(CONFIG_SUSPEND_SMP) += smp.o
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d1abffbb9c..42c431c8bdd 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
int cpus;
unsigned long image_pages;
unsigned long pages;
+ unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
@@ -65,6 +66,19 @@ extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
int snapshot_image_loaded(struct snapshot_handle *handle);
+#define SNAPSHOT_IOC_MAGIC '3'
+#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
+#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
+#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
+#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
+#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
+#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_IOC_MAXNR 10
+
/**
* The bitmap is used for tracing allocated swap pages
*
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cc349437fb7..0036955357e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,7 @@
struct pbe *pagedir_nosave;
static unsigned int nr_copy_pages;
static unsigned int nr_meta_pages;
+static unsigned long *buffer;
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void)
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
free_pagedir(pblist);
pblist = NULL;
} else
- create_pbe_list(pblist, nr_pages);
+ create_pbe_list(pblist, nr_pages);
return pblist;
}
@@ -418,6 +419,7 @@ void swsusp_free(void)
nr_copy_pages = 0;
nr_meta_pages = 0;
pagedir_nosave = NULL;
+ buffer = NULL;
}
@@ -523,6 +525,8 @@ static void init_header(struct swsusp_info *info)
info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
+ info->size = info->pages;
+ info->size <<= PAGE_SHIFT;
}
/**
@@ -568,8 +572,6 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
int snapshot_read_next(struct snapshot_handle *handle, size_t count)
{
- static unsigned long *buffer;
-
if (handle->page > nr_meta_pages + nr_copy_pages)
return 0;
if (!buffer) {
@@ -774,7 +776,6 @@ static int create_image(struct snapshot_handle *handle)
int snapshot_write_next(struct snapshot_handle *handle, size_t count)
{
- static unsigned long *buffer;
int error = 0;
if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 00000000000..8cabc405ca1
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,301 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+#define SNAPSHOT_MINOR 231
+
+static struct snapshot_data {
+ struct snapshot_handle handle;
+ int swap;
+ struct bitmap_page *bitmap;
+ int mode;
+ char frozen;
+ char ready;
+} snapshot_state;
+
+static atomic_t device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ if (!atomic_add_unless(&device_available, -1, 0))
+ return -EBUSY;
+
+ if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+ return -ENOSYS;
+
+ nonseekable_open(inode, filp);
+ data = &snapshot_state;
+ filp->private_data = data;
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+ data->mode = O_RDONLY;
+ } else {
+ data->swap = -1;
+ data->mode = O_WRONLY;
+ }
+ data->bitmap = NULL;
+ data->frozen = 0;
+ data->ready = 0;
+
+ return 0;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ swsusp_free();
+ data = filp->private_data;
+ free_all_swap_pages(data->swap, data->bitmap);
+ free_bitmap(data->bitmap);
+ if (data->frozen) {
+ down(&pm_sem);
+ thaw_processes();
+ enable_nonboot_cpus();
+ up(&pm_sem);
+ }
+ atomic_inc(&device_available);
+ return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ data = filp->private_data;
+ res = snapshot_read_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_to_user(buf, data_of(data->handle), res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+ return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ data = filp->private_data;
+ res = snapshot_write_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_from_user(data_of(data->handle), buf, res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+ return res;
+}
+
+static int snapshot_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int error = 0;
+ struct snapshot_data *data;
+ loff_t offset, avail;
+
+ if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+ return -ENOTTY;
+ if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+ return -ENOTTY;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ data = filp->private_data;
+
+ switch (cmd) {
+
+ case SNAPSHOT_FREEZE:
+ if (data->frozen)
+ break;
+ sys_sync();
+ down(&pm_sem);
+ pm_prepare_console();
+ disable_nonboot_cpus();
+ if (freeze_processes()) {
+ thaw_processes();
+ enable_nonboot_cpus();
+ pm_restore_console();
+ error = -EBUSY;
+ }
+ up(&pm_sem);
+ if (!error)
+ data->frozen = 1;
+ break;
+
+ case SNAPSHOT_UNFREEZE:
+ if (!data->frozen)
+ break;
+ down(&pm_sem);
+ thaw_processes();
+ enable_nonboot_cpus();
+ pm_restore_console();
+ up(&pm_sem);
+ data->frozen = 0;
+ break;
+
+ case SNAPSHOT_ATOMIC_SNAPSHOT:
+ if (data->mode != O_RDONLY || !data->frozen || data->ready) {
+ error = -EPERM;
+ break;
+ }
+ down(&pm_sem);
+ /* Free memory before shutting down devices. */
+ error = swsusp_shrink_memory();
+ if (!error) {
+ error = device_suspend(PMSG_FREEZE);
+ if (!error) {
+ in_suspend = 1;
+ error = swsusp_suspend();
+ device_resume();
+ }
+ }
+ up(&pm_sem);
+ if (!error)
+ error = put_user(in_suspend, (unsigned int __user *)arg);
+ if (!error)
+ data->ready = 1;
+ break;
+
+ case SNAPSHOT_ATOMIC_RESTORE:
+ if (data->mode != O_WRONLY || !data->frozen ||
+ !snapshot_image_loaded(&data->handle)) {
+ error = -EPERM;
+ break;
+ }
+ down(&pm_sem);
+ pm_prepare_console();
+ error = device_suspend(PMSG_FREEZE);
+ if (!error) {
+ error = swsusp_resume();
+ device_resume();
+ }
+ pm_restore_console();
+ up(&pm_sem);
+ break;
+
+ case SNAPSHOT_FREE:
+ swsusp_free();
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ data->ready = 0;
+ break;
+
+ case SNAPSHOT_SET_IMAGE_SIZE:
+ image_size = arg;
+ break;
+
+ case SNAPSHOT_AVAIL_SWAP:
+ avail = count_swap_pages(data->swap, 1);
+ avail <<= PAGE_SHIFT;
+ error = put_user(avail, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_GET_SWAP_PAGE:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ if (!data->bitmap) {
+ data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
+ if (!data->bitmap) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ offset = alloc_swap_page(data->swap, data->bitmap);
+ if (offset) {
+ offset <<= PAGE_SHIFT;
+ error = put_user(offset, (loff_t __user *)arg);
+ } else {
+ error = -ENOSPC;
+ }
+ break;
+
+ case SNAPSHOT_FREE_SWAP_PAGES:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ free_all_swap_pages(data->swap, data->bitmap);
+ free_bitmap(data->bitmap);
+ data->bitmap = NULL;
+ break;
+
+ case SNAPSHOT_SET_SWAP_FILE:
+ if (!data->bitmap) {
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ if (old_decode_dev(arg)) {
+ data->swap = swap_type_of(old_decode_dev(arg));
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ } else {
+ error = -EPERM;
+ }
+ break;
+
+ default:
+ error = -ENOTTY;
+
+ }
+
+ return error;
+}
+
+static struct file_operations snapshot_fops = {
+ .open = snapshot_open,
+ .release = snapshot_release,
+ .read = snapshot_read,
+ .write = snapshot_write,
+ .llseek = no_llseek,
+ .ioctl = snapshot_ioctl,
+};
+
+static struct miscdevice snapshot_device = {
+ .minor = SNAPSHOT_MINOR,
+ .name = "snapshot",
+ .fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+ return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4d11f9d8466..39aa9d12961 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -428,14 +428,16 @@ int swap_type_of(dev_t device)
{
int i;
- if (!device)
- return -EINVAL;
spin_lock(&swap_lock);
for (i = 0; i < nr_swapfiles; i++) {
struct inode *inode;
if (!(swap_info[i].flags & SWP_WRITEOK))
continue;
+ if (!device) {
+ spin_unlock(&swap_lock);
+ return i;
+ }
inode = swap_info->swap_file->f_dentry->d_inode;
if (S_ISBLK(inode->i_mode) &&
device == MKDEV(imajor(inode), iminor(inode))) {